[Xen-changelog] [xen-unstable] Replace shadow pagetable code with shadow2.

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] Replace shadow pagetable code with shadow2.
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Sat, 19 Aug 2006 02:40:30 +0000
Delivery-date: Fri, 18 Aug 2006 19:42:47 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User tdeegan@xxxxxxxxxxxxxxxxxxxxx
# Node ID 0f917d63e9608315e4a925109401e383fc895b2a
# Parent  fda70200da01b89d5339342df6c0db372369a16d
Replace shadow pagetable code with shadow2.
---
 xen/arch/x86/audit.c                     |  984 ------
 xen/arch/x86/shadow.c                    | 4150 ----------------------------
 xen/arch/x86/shadow32.c                  | 3782 --------------------------
 xen/arch/x86/shadow_guest32.c            |   16 
 xen/arch/x86/shadow_guest32pae.c         |   16 
 xen/arch/x86/shadow_public.c             | 2143 --------------
 xen/include/asm-x86/shadow_64.h          |  587 ----
 xen/include/asm-x86/shadow_ops.h         |  138 
 xen/include/asm-x86/shadow_public.h      |   61 
 .hgtags                                  |   10 
 tools/examples/xmexample.hvm             |    4 
 tools/libxc/xc_domain.c                  |   13 
 tools/libxc/xc_hvm_build.c               |   13 
 tools/libxc/xc_linux_build.c             |    2 
 tools/libxc/xc_linux_save.c              |   18 
 tools/libxc/xenctrl.h                    |    2 
 tools/misc/xc_shadow.c                   |    2 
 tools/python/xen/lowlevel/xc/xc.c        |   69 
 tools/python/xen/xend/XendDomain.py      |   24 
 tools/python/xen/xend/XendDomainInfo.py  |   47 
 tools/python/xen/xend/image.py           |   17 
 tools/python/xen/xm/create.py            |    9 
 xen/arch/x86/Makefile                    |   16 
 xen/arch/x86/dom0_ops.c                  |    2 
 xen/arch/x86/domain.c                    |  106 
 xen/arch/x86/domain_build.c              |   13 
 xen/arch/x86/hvm/hvm.c                   |   23 
 xen/arch/x86/hvm/platform.c              |    9 
 xen/arch/x86/hvm/svm/svm.c               |  265 -
 xen/arch/x86/hvm/svm/vmcb.c              |    4 
 xen/arch/x86/hvm/vlapic.c                |    3 
 xen/arch/x86/hvm/vmx/vmcs.c              |   15 
 xen/arch/x86/hvm/vmx/vmx.c               |  228 -
 xen/arch/x86/mm.c                        |  485 +--
 xen/arch/x86/setup.c                     |    2 
 xen/arch/x86/shadow2-common.c            | 3394 +++++++++++++++++++++++
 xen/arch/x86/shadow2.c                   | 4469 +++++++++++++++++++++++++++++++
 xen/arch/x86/smpboot.c                   |    2 
 xen/arch/x86/traps.c                     |   32 
 xen/arch/x86/x86_32/domain_page.c        |   33 
 xen/arch/x86/x86_32/mm.c                 |    3 
 xen/arch/x86/x86_64/mm.c                 |    3 
 xen/arch/x86/x86_64/traps.c              |   14 
 xen/common/acm_ops.c                     |    1 
 xen/common/grant_table.c                 |    4 
 xen/common/keyhandler.c                  |   19 
 xen/common/memory.c                      |   11 
 xen/drivers/char/console.c               |   50 
 xen/include/asm-x86/bitops.h             |   18 
 xen/include/asm-x86/config.h             |   22 
 xen/include/asm-x86/domain.h             |   99 
 xen/include/asm-x86/grant_table.h        |    2 
 xen/include/asm-x86/hvm/hvm.h            |   25 
 xen/include/asm-x86/hvm/support.h        |   11 
 xen/include/asm-x86/hvm/vcpu.h           |    6 
 xen/include/asm-x86/hvm/vmx/vmcs.h       |    1 
 xen/include/asm-x86/hvm/vmx/vmx.h        |   49 
 xen/include/asm-x86/mm.h                 |  140 
 xen/include/asm-x86/msr.h                |    4 
 xen/include/asm-x86/page-guest32.h       |    7 
 xen/include/asm-x86/page.h               |   37 
 xen/include/asm-x86/perfc_defn.h         |   53 
 xen/include/asm-x86/processor.h          |    1 
 xen/include/asm-x86/shadow.h             | 1791 ------------
 xen/include/asm-x86/shadow2-multi.h      |  116 
 xen/include/asm-x86/shadow2-private.h    |  612 ++++
 xen/include/asm-x86/shadow2-types.h      |  705 ++++
 xen/include/asm-x86/shadow2.h            |  627 ++++
 xen/include/asm-x86/x86_32/page-2level.h |    1 
 xen/include/asm-x86/x86_32/page-3level.h |    3 
 xen/include/asm-x86/x86_64/page.h        |    5 
 xen/include/public/dom0_ops.h            |   16 
 xen/include/xen/domain_page.h            |   13 
 xen/include/xen/lib.h                    |    4 
 xen/include/xen/list.h                   |   10 
 xen/include/xen/sched.h                  |    5 
 76 files changed, 11147 insertions(+), 14549 deletions(-)

diff -r fda70200da01 -r 0f917d63e960 .hgtags
--- a/.hgtags   Wed Aug 16 16:16:32 2006 +0100
+++ b/.hgtags   Wed Aug 16 17:02:35 2006 +0100
@@ -15,3 +15,13 @@ c8fdb0caa77b429cf47f9707926e83947778cb48
 c8fdb0caa77b429cf47f9707926e83947778cb48 RELEASE-3.0.0
 af0573e9e5258db0a9d28aa954dd302ddd2c2d23 3.0.2-rc
 d0d3fef37685be264a7f52201f8ef44c030daad3 3.0.2-branched
+6e864d7de9db066f92bea505d256bfe286200fed last-code-review
+a898a6510c5db4e3d1f69d40fcacb540643b0f22 mainline
+bfa6f4a0c594bc0ebd896437d69857b58dab0988 last-code-review
+fc6cbf31bd883bc76ceb97f4b817ac88078d696a latest patch to unstable
+8e55c5c1147589b7a6a1875384d4317aec7ccf84 mainline
+2d2ed4d9b1c14aeee29dfdd77acd6017d31290cd mainline
+0e32095a7b4611d18a82052a9d5b23e474f91af9 mainline
+88e6bd5e2b5439f97e1d50a8724103c619aeaadf mainline
+5233c4b076b9aa073eff63508461b7bfa597737c mainline
+fda70200da01b89d5339342df6c0db372369a16d mainline
diff -r fda70200da01 -r 0f917d63e960 tools/examples/xmexample.hvm
--- a/tools/examples/xmexample.hvm      Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/examples/xmexample.hvm      Wed Aug 16 17:02:35 2006 +0100
@@ -26,6 +26,10 @@ builder='hvm'
 #          memory errors. The domain needs enough memory to boot kernel
 #          and modules. Allocating less than 32MBs is not recommended.
 memory = 128
+
+# Shadow pagetable memory for the domain, in MB.
+# Should be at least 2KB per MB of domain memory, plus a few MB per vcpu.
+shadow_memory = 8
 
 # A name for your domain. All domains must have different names.
 name = "ExampleHVMDomain"
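
The shadow_memory comment above gives a sizing rule of thumb rather than an exact formula. A minimal sketch of the arithmetic, assuming a hypothetical 1024MB, 2-vcpu HVM guest and reading "a few MB per vcpu" as 2MB (both figures are illustrative, not part of the patch):

# Rule from the comment above: at least 2KB of shadow per MB of guest RAM,
# plus a few MB per vcpu (assumed to be 2MB here).
memory_mb = 1024          # hypothetical guest RAM
vcpus = 2                 # hypothetical vcpu count
per_vcpu_mb = 2           # assumed reading of "a few MB per vcpu"

floor_mb = (memory_mb * 2) // 1024 + vcpus * per_vcpu_mb
print(floor_mb)           # -> 6, so the default shadow_memory = 8 leaves headroom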
diff -r fda70200da01 -r 0f917d63e960 tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c   Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/libxc/xc_domain.c   Wed Aug 16 17:02:35 2006 +0100
@@ -213,21 +213,28 @@ int xc_shadow_control(int xc_handle,
                       unsigned int sop,
                       unsigned long *dirty_bitmap,
                       unsigned long pages,
-                      xc_shadow_control_stats_t *stats )
+                      unsigned long *mb,
+                      uint32_t mode,
+                      xc_shadow_control_stats_t *stats)
 {
     int rc;
     DECLARE_DOM0_OP;
     op.cmd = DOM0_SHADOW_CONTROL;
     op.u.shadow_control.domain = (domid_t)domid;
     op.u.shadow_control.op     = sop;
+    op.u.shadow_control.pages  = pages;
+    op.u.shadow_control.mb     = mb ? *mb : 0;
+    op.u.shadow_control.mode   = mode;
     set_xen_guest_handle(op.u.shadow_control.dirty_bitmap, dirty_bitmap);
-    op.u.shadow_control.pages  = pages;
 
     rc = do_dom0_op(xc_handle, &op);
 
     if ( stats )
         memcpy(stats, &op.u.shadow_control.stats,
                sizeof(xc_shadow_control_stats_t));
+    
+    if ( mb ) 
+        *mb = op.u.shadow_control.mb;
 
     return (rc == 0) ? op.u.shadow_control.pages : rc;
 }
@@ -391,7 +398,7 @@ int xc_domain_memory_populate_physmap(in
 
     if ( err > 0 )
     {
-        DPRINTF("Failed deallocation for dom %d: %ld pages order %d\n",
+        DPRINTF("Failed allocation for dom %d: %ld pages order %d\n",
                 domid, nr_extents, extent_order);
         errno = EBUSY;
         err = -1;
diff -r fda70200da01 -r 0f917d63e960 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c        Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/libxc/xc_hvm_build.c        Wed Aug 16 17:02:35 2006 +0100
@@ -395,6 +395,19 @@ static int xc_hvm_build_internal(int xc_
         PERROR("Could not get info on domain");
         goto error_out;
     }
+
+    /* HVM domains must be put into shadow2 mode at the start of day */
+    if ( xc_shadow_control(xc_handle, domid, DOM0_SHADOW2_CONTROL_OP_ENABLE,
+                           NULL, 0, NULL, 
+                           DOM0_SHADOW2_CONTROL_FLAG_ENABLE 
+                           | DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT
+                           | DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE
+                           | DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL, 
+                           NULL) ) 
+    {
+        PERROR("Could not enable shadow paging for domain.\n");
+        goto error_out;
+    }        
 
     memset(ctxt, 0, sizeof(*ctxt));
 
diff -r fda70200da01 -r 0f917d63e960 tools/libxc/xc_linux_build.c
--- a/tools/libxc/xc_linux_build.c      Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/libxc/xc_linux_build.c      Wed Aug 16 17:02:35 2006 +0100
@@ -972,7 +972,7 @@ static int setup_guest(int xc_handle,
         /* Enable shadow translate mode */
         if ( xc_shadow_control(xc_handle, dom,
                                DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE,
-                               NULL, 0, NULL) < 0 )
+                               NULL, 0, NULL, 0, NULL) < 0 )
         {
             PERROR("Could not enable translation mode");
             goto error_out;
diff -r fda70200da01 -r 0f917d63e960 tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c       Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/libxc/xc_linux_save.c       Wed Aug 16 17:02:35 2006 +0100
@@ -338,13 +338,13 @@ static int analysis_phase(int xc_handle,
         int i;
 
         xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
-                          arr, max_pfn, NULL);
+                          arr, max_pfn, NULL, 0, NULL);
         DPRINTF("#Flush\n");
         for ( i = 0; i < 40; i++ ) {
             usleep(50000);
             now = llgettimeofday();
             xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK,
-                              NULL, 0, &stats);
+                              NULL, 0, NULL, 0, &stats);
 
             DPRINTF("now= %lld faults= %" PRId32 " dirty= %" PRId32
                     " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
@@ -727,7 +727,7 @@ int xc_linux_save(int xc_handle, int io_
 
         if (xc_shadow_control(xc_handle, dom,
                               DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
-                              NULL, 0, NULL ) < 0) {
+                              NULL, 0, NULL, 0, NULL) < 0) {
             ERR("Couldn't enable shadow mode");
             goto out;
         }
@@ -879,7 +879,7 @@ int xc_linux_save(int xc_handle, int io_
                but this is fast enough for the moment. */
             if (!last_iter && xc_shadow_control(
                     xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
-                    to_skip, max_pfn, NULL) != max_pfn) {
+                    to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
                 ERR("Error peeking shadow bitmap");
                 goto out;
             }
@@ -1084,8 +1084,9 @@ int xc_linux_save(int xc_handle, int io_
                         (unsigned long)ctxt.user_regs.edx);
             }
 
-            if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
-                                  to_send, max_pfn, &stats ) != max_pfn) {
+            if (xc_shadow_control(xc_handle, dom, 
+                                  DOM0_SHADOW_CONTROL_OP_CLEAN, to_send, 
+                                  max_pfn, NULL, 0, &stats) != max_pfn) {
                 ERR("Error flushing shadow PT");
                 goto out;
             }
@@ -1174,8 +1175,9 @@ int xc_linux_save(int xc_handle, int io_
  out:
 
     if (live) {
-        if(xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_OFF,
-                             NULL, 0, NULL ) < 0) {
+        if(xc_shadow_control(xc_handle, dom, 
+                             DOM0_SHADOW_CONTROL_OP_OFF,
+                             NULL, 0, NULL, 0, NULL) < 0) {
             DPRINTF("Warning - couldn't disable shadow mode");
         }
     }
diff -r fda70200da01 -r 0f917d63e960 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/libxc/xenctrl.h     Wed Aug 16 17:02:35 2006 +0100
@@ -323,6 +323,8 @@ int xc_shadow_control(int xc_handle,
                       unsigned int sop,
                       unsigned long *dirty_bitmap,
                       unsigned long pages,
+                      unsigned long *mb,
+                      uint32_t mode,
                       xc_shadow_control_stats_t *stats);
 
 int xc_bvtsched_global_set(int xc_handle,
diff -r fda70200da01 -r 0f917d63e960 tools/misc/xc_shadow.c
--- a/tools/misc/xc_shadow.c    Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/misc/xc_shadow.c    Wed Aug 16 17:02:35 2006 +0100
@@ -60,6 +60,8 @@ int main(int argc, char *argv[])
                            mode, 
                            NULL,
                            0,
+                           NULL,
+                           0,
                            NULL) < 0 )
     {    
         fprintf(stderr, "Error reseting performance counters: %d (%s)\n",
diff -r fda70200da01 -r 0f917d63e960 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Aug 16 17:02:35 2006 +0100
@@ -669,6 +669,59 @@ static PyObject *pyxc_sedf_domain_get(Xc
                          "weight",    weight);
 }
 
+static PyObject *pyxc_shadow_control(PyObject *self,
+                                     PyObject *args,
+                                     PyObject *kwds)
+{
+    XcObject *xc = (XcObject *)self;
+
+    uint32_t dom;
+    int op=0;
+
+    static char *kwd_list[] = { "dom", "op", NULL };
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list, 
+                                      &dom, &op) )
+        return NULL;
+    
+    if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, NULL, 0, NULL) 
+         < 0 )
+        return PyErr_SetFromErrno(xc_error);
+    
+    Py_INCREF(zero);
+    return zero;
+}
+
+static PyObject *pyxc_shadow_mem_control(PyObject *self,
+                                         PyObject *args,
+                                         PyObject *kwds)
+{
+    XcObject *xc = (XcObject *)self;
+    int op;
+    uint32_t dom;
+    int mbarg = -1;
+    unsigned long mb;
+
+    static char *kwd_list[] = { "dom", "mb", NULL };
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list, 
+                                      &dom, &mbarg) )
+        return NULL;
+    
+    if ( mbarg < 0 ) 
+        op = DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION;
+    else 
+    {
+        mb = mbarg;
+        op = DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION;
+    }
+    if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, &mb, 0, NULL) < 0 )
+        return PyErr_SetFromErrno(xc_error);
+    
+    mbarg = mb;
+    return Py_BuildValue("i", mbarg);
+}
+
 static PyObject *pyxc_sched_credit_domain_set(XcObject *self,
                                               PyObject *args,
                                               PyObject *kwds)
@@ -1118,6 +1171,22 @@ static PyMethodDef pyxc_methods[] = {
       "Get information about the Xen host\n"
       "Returns [dict]: information about Xen"
       "        [None]: on failure.\n" },
+
+    { "shadow_control", 
+      (PyCFunction)pyxc_shadow_control, 
+      METH_VARARGS | METH_KEYWORDS, "\n"
+      "Set parameter for shadow pagetable interface\n"
+      " dom [int]:   Identifier of domain.\n"
+      " op [int, 0]: operation\n\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
+    { "shadow_mem_control", 
+      (PyCFunction)pyxc_shadow_mem_control, 
+      METH_VARARGS | METH_KEYWORDS, "\n"
+      "Set or read shadow pagetable memory use\n"
+      " dom [int]:   Identifier of domain.\n"
+      " mb [int, -1]: MB of shadow memory this domain should have.\n\n"
+      "Returns: [int] MB of shadow memory in use by this domain.\n" },
 
     { "domain_setmaxmem", 
       (PyCFunction)pyxc_domain_setmaxmem, 
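
The two bindings added above split the interface: shadow_control() issues a bare shadow operation, while shadow_mem_control() reads the current shadow allocation when called with just a domain id and sets it when given mb (a missing mb selects DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION, otherwise SET_ALLOCATION). A minimal usage sketch; the handle constructor and the domain id are assumptions for illustration:

import xen.lowlevel.xc

xc = xen.lowlevel.xc.xc()      # handle; constructor name assumed, as used by xend
domid = 1                      # hypothetical domain id

current_mb = xc.shadow_mem_control(domid)      # no mb -> query current allocation
new_mb = xc.shadow_mem_control(domid, mb=16)   # mb given -> set allocation to 16MB
print(current_mb, new_mb)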
diff -r fda70200da01 -r 0f917d63e960 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py       Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/python/xen/xend/XendDomain.py       Wed Aug 16 17:02:35 2006 +0100
@@ -532,6 +532,30 @@ class XendDomain:
         except Exception, ex:
             raise XendError(str(ex))
 
+    def domain_shadow_control(self, domid, op):
+        """Shadow page control."""
+        dominfo = self.domain_lookup(domid)
+        try:
+            return xc.shadow_control(dominfo.getDomid(), op)
+        except Exception, ex:
+            raise XendError(str(ex))
+
+    def domain_shadow_mem_get(self, domid):
+        """Get shadow pagetable memory allocation."""
+        dominfo = self.domain_lookup(domid)
+        try:
+            return xc.shadow_mem_control(dominfo.getDomid())
+        except Exception, ex:
+            raise XendError(str(ex))
+
+    def domain_shadow_mem_set(self, domid, mb):
+        """Set shadow pagetable memory allocation."""
+        dominfo = self.domain_lookup(domid)
+        try:
+            return xc.shadow_mem_control(dominfo.getDomid(), mb=mb)
+        except Exception, ex:
+            raise XendError(str(ex))
+
     def domain_sched_credit_get(self, domid):
         """Get credit scheduler parameters for a domain.
         """
diff -r fda70200da01 -r 0f917d63e960 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/python/xen/xend/XendDomainInfo.py   Wed Aug 16 17:02:35 2006 +0100
@@ -30,6 +30,7 @@ import time
 import time
 import threading
 import os
+import math
 
 import xen.lowlevel.xc
 from xen.util import asserts
@@ -126,16 +127,17 @@ VM_CONFIG_PARAMS = [
 # don't come out of xc in the same form as they are specified in the config
 # file, so those are handled separately.
 ROUNDTRIPPING_CONFIG_ENTRIES = [
-    ('uuid',       str),
-    ('vcpus',      int),
-    ('vcpu_avail', int),
-    ('cpu_weight', float),
-    ('memory',     int),
-    ('maxmem',     int),
-    ('bootloader', str),
+    ('uuid',            str),
+    ('vcpus',           int),
+    ('vcpu_avail',      int),
+    ('cpu_weight',      float),
+    ('memory',          int),
+    ('shadow_memory',   int),
+    ('maxmem',          int),
+    ('bootloader',      str),
     ('bootloader_args', str),
-    ('features', str),
-    ('localtime', int),
+    ('features',        str),
+    ('localtime',       int),
     ]
 
 ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFIG_PARAMS
@@ -146,12 +148,13 @@ ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFI
 # entries written to the store that cannot be reconfigured on-the-fly.
 #
 VM_STORE_ENTRIES = [
-    ('uuid',       str),
-    ('vcpus',      int),
-    ('vcpu_avail', int),
-    ('memory',     int),
-    ('maxmem',     int),
-    ('start_time', float),
+    ('uuid',          str),
+    ('vcpus',         int),
+    ('vcpu_avail',    int),
+    ('memory',        int),
+    ('shadow_memory', int),
+    ('maxmem',        int),
+    ('start_time',    float),
     ]
 
 VM_STORE_ENTRIES += VM_CONFIG_PARAMS
@@ -572,6 +575,7 @@ class XendDomainInfo:
             defaultInfo('vcpu_avail',   lambda: (1 << self.info['vcpus']) - 1)
 
             defaultInfo('memory',       lambda: 0)
+            defaultInfo('shadow_memory', lambda: 0)
             defaultInfo('maxmem',       lambda: 0)
             defaultInfo('bootloader',   lambda: None)
             defaultInfo('bootloader_args', lambda: None)            
@@ -1280,7 +1284,18 @@ class XendDomainInfo:
             xc.domain_setmaxmem(self.domid, self.info['maxmem'] * 1024)
 
             m = self.image.getDomainMemory(self.info['memory'] * 1024)
-            balloon.free(m)
+
+            # get the domain's shadow memory requirement
+            sm = int(math.ceil(self.image.getDomainShadowMemory(m) / 1024.0))
+            if self.info['shadow_memory'] > sm:
+                sm = self.info['shadow_memory']
+
+            # Make sure there's enough RAM available for the domain
+            balloon.free(m + sm * 1024)
+
+            # Set up the shadow memory
+            sm = xc.shadow_mem_control(self.domid, mb=sm)
+            self.info['shadow_memory'] = sm
 
             init_reservation = self.info['memory'] * 1024
             if os.uname()[4] in ('ia64', 'ppc64'):
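
The new allocation step above is straight arithmetic: take the image handler's shadow estimate in KiB, round it up to whole megabytes, let a larger shadow_memory value from the config win, and free enough memory for the guest plus its shadow before asking Xen for the allocation. A sketch with hypothetical numbers (the 6144KiB estimate anticipates the HVM formula added to image.py below):

import math

m = 1024 * 1024            # guest RAM in KiB (hypothetical 1024MB guest)
shadow_kib = 6144          # hypothetical image.getDomainShadowMemory(m) result
configured_mb = 8          # hypothetical shadow_memory value from the config

sm = int(math.ceil(shadow_kib / 1024.0))   # round up to whole MB -> 6
sm = max(sm, configured_mb)                # a larger configured value wins -> 8
needed_kib = m + sm * 1024                 # RAM to free before building -> 1056768
print(sm, needed_kib)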
diff -r fda70200da01 -r 0f917d63e960 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py    Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/python/xen/xend/image.py    Wed Aug 16 17:02:35 2006 +0100
@@ -152,6 +152,12 @@ class ImageHandler:
             if 'hvm' in xc.xeninfo()['xen_caps']:
                 mem_kb += 4*1024;
         return mem_kb
+
+    def getDomainShadowMemory(self, mem_kb):
+        """@return The minimum shadow memory required, in KiB, for a domain 
+        with mem_kb KiB of RAM."""
+        # PV domains don't need any shadow memory
+        return 0
 
     def buildDomain(self):
         """Build the domain. Define in subclass."""
@@ -364,6 +370,17 @@ class HVMImageHandler(ImageHandler):
             extra_pages = int( math.ceil( extra_mb*1024 / page_kb ))
         return mem_kb + extra_pages * page_kb
 
+    def getDomainShadowMemory(self, mem_kb):
+        """@return The minimum shadow memory required, in KiB, for a domain 
+        with mem_kb KiB of RAM."""
+        if os.uname()[4] in ('ia64', 'ppc64'):
+            # Explicit shadow memory is not a concept 
+            return 0
+        else:
+            # 1MB per vcpu plus 4Kib/Mib of RAM.  This is higher than 
+            # the minimum that Xen would allocate if no value were given.
+            return 1024 * self.vm.getVCpuCount() + mem_kb / 256
+
     def register_shutdown_watch(self):
         """ add xen store watch on control/shutdown """
         self.shutdownWatch = xswatch(self.vm.dompath + "/control/shutdown", \
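
Plugging numbers into the HVM estimate above (1MB per vcpu plus 4KiB per MiB of RAM, i.e. mem_kb / 256) shows where a figure like the 6144KiB used in the earlier sketch comes from. The guest size and vcpu count are again illustrative:

vcpus = 2                  # hypothetical
mem_kb = 1024 * 1024       # hypothetical 1024MiB guest, expressed in KiB

shadow_kib = 1024 * vcpus + mem_kb // 256   # // keeps the sketch integral on any Python
print(shadow_kib)          # -> 6144 KiB, about 6MB before xend rounds and applies the config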
diff -r fda70200da01 -r 0f917d63e960 tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py     Wed Aug 16 16:16:32 2006 +0100
+++ b/tools/python/xen/xm/create.py     Wed Aug 16 17:02:35 2006 +0100
@@ -157,6 +157,10 @@ gopts.var('maxmem', val='MEMORY',
 gopts.var('maxmem', val='MEMORY',
           fn=set_int, default=None,
           use="Maximum domain memory in MB.")
+
+gopts.var('shadow_memory', val='MEMORY',
+          fn=set_int, default=0,
+          use="Domain shadow memory in MB.")
 
 gopts.var('cpu', val='CPU',
           fn=set_int, default=None,
@@ -666,8 +670,9 @@ def make_config(vals):
             if v:
                 config.append([n, v])
 
-    map(add_conf, ['name', 'memory', 'maxmem', 'restart', 'on_poweroff',
-                   'on_reboot', 'on_crash', 'vcpus', 'features'])
+    map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
+                   'restart', 'on_poweroff', 'on_reboot', 'on_crash',
+                   'vcpus', 'features'])
 
     if vals.uuid is not None:
         config.append(['uuid', vals.uuid])
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile     Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/Makefile     Wed Aug 16 17:02:35 2006 +0100
@@ -8,7 +8,6 @@ subdir-$(x86_64) += x86_64
 subdir-$(x86_64) += x86_64
 
 obj-y += apic.o
-obj-y += audit.o
 obj-y += bitops.o
 obj-y += compat.o
 obj-y += delay.o
@@ -41,12 +40,21 @@ obj-y += x86_emulate.o
 obj-y += x86_emulate.o
 
 ifneq ($(pae),n)
-obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
+obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o
 else
-obj-$(x86_32) += shadow32.o
+obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o
 endif
 
-obj-$(x86_64) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
+obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \
+                 shadow2_g2_on_s3.o
+
+guest_levels  = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
+shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
+shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
+                -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
+
+shadow2_%.o: shadow2.c $(HDRS) Makefile
+       $(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@
 
 obj-$(crash_debug) += gdbstub.o
 
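
The pattern rule above builds shadow2.c several times, once per guest/shadow paging-level pair, and derives the -D flags from the object file name itself. A Python rendering of what the $(subst)/$(filter) munging in guest_levels and shadow_levels extracts, purely for illustration (the Makefile functions above do the real work):

def shadow2_defns(obj_name):
    # Strip "shadow2_", split on "_" and ".", then pick the g<N> and s<N> parts,
    # mirroring the guest_levels/shadow_levels definitions above.
    parts = obj_name.replace("shadow2_", "").replace(".", "_").split("_")
    guest  = [p[1:] for p in parts if p.startswith("g")][0]
    shadow = [p[1:] for p in parts if p.startswith("s")][0]
    return "-DGUEST_PAGING_LEVELS=%s -DSHADOW_PAGING_LEVELS=%s" % (guest, shadow)

for obj in ("shadow2_g2_on_s2.o", "shadow2_g2_on_s3.o",
            "shadow2_g3_on_s3.o", "shadow2_g4_on_s4.o"):
    print(obj + " -> " + shadow2_defns(obj))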
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/dom0_ops.c
--- a/xen/arch/x86/dom0_ops.c   Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/dom0_ops.c   Wed Aug 16 17:02:35 2006 +0100
@@ -89,7 +89,7 @@ long arch_do_dom0_op(struct dom0_op *op,
         d = find_domain_by_id(op->u.shadow_control.domain);
         if ( d != NULL )
         {
-            ret = shadow_mode_control(d, &op->u.shadow_control);
+            ret = shadow2_control_op(d, &op->u.shadow_control, u_dom0_op);
             put_domain(d);
             copy_to_guest(u_dom0_op, op, 1);
         } 
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/domain.c     Wed Aug 16 17:02:35 2006 +0100
@@ -134,13 +134,6 @@ struct vcpu *alloc_vcpu_struct(struct do
     v->arch.perdomain_ptes =
         d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);
 
-    v->arch.guest_vtable  = __linear_l2_table;
-    v->arch.shadow_vtable = __shadow_linear_l2_table;
-#if defined(__x86_64__)
-    v->arch.guest_vl3table = __linear_l3_table;
-    v->arch.guest_vl4table = __linear_l4_table;
-#endif
-
     pae_l3_cache_init(&v->arch.pae_l3_cache);
 
     return v;
@@ -155,9 +148,7 @@ int arch_domain_create(struct domain *d)
 {
     l1_pgentry_t gdt_l1e;
     int vcpuid, pdpt_order;
-#ifdef __x86_64__
     int i;
-#endif
 
     pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
     d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
@@ -202,8 +193,12 @@ int arch_domain_create(struct domain *d)
 
 #endif /* __x86_64__ */
 
-    shadow_lock_init(d);
-    INIT_LIST_HEAD(&d->arch.free_shadow_frames);
+    shadow2_lock_init(d);
+    for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ )
+        INIT_LIST_HEAD(&d->arch.shadow2_freelists[i]);
+    INIT_LIST_HEAD(&d->arch.shadow2_p2m_freelist);
+    INIT_LIST_HEAD(&d->arch.shadow2_p2m_inuse);
+    INIT_LIST_HEAD(&d->arch.shadow2_toplevel_shadows);
 
     if ( !is_idle_domain(d) )
     {
@@ -234,6 +229,8 @@ int arch_domain_create(struct domain *d)
 
 void arch_domain_destroy(struct domain *d)
 {
+    shadow2_final_teardown(d);
+
     free_xenheap_pages(
         d->arch.mm_perdomain_pt,
         get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
@@ -328,31 +325,35 @@ int arch_set_info_guest(
         if ( !hvm_initialize_guest_resources(v) )
             return -EINVAL;
     }
-    else if ( shadow_mode_refcounts(d) )
-    {
-        if ( !get_page(mfn_to_page(cr3_pfn), d) )
+    else
+    {
+        if ( !get_page_and_type(mfn_to_page(cr3_pfn), d,
+                                PGT_base_page_table) )
         {
             destroy_gdt(v);
             return -EINVAL;
         }
-    }
-    else
-    {
-        if ( !get_page_and_type(mfn_to_page(cr3_pfn), d,
-                                PGT_base_page_table) )
-        {
-            destroy_gdt(v);
-            return -EINVAL;
-        }
-    }
-
-    update_pagetables(v);
+    }    
+
+    /* Shadow2: make sure the domain has enough shadow memory to
+     * boot another vcpu */
+    if ( shadow2_mode_enabled(d) 
+         && d->arch.shadow2_total_pages < shadow2_min_acceptable_pages(d) )
+    {
+        destroy_gdt(v);
+        return -ENOMEM;
+    }
 
     if ( v->vcpu_id == 0 )
         update_domain_wallclock_time(d);
 
     /* Don't redo final setup */
     set_bit(_VCPUF_initialised, &v->vcpu_flags);
+
+    if ( shadow2_mode_enabled(d) )
+        shadow2_update_paging_modes(v);
+
+    update_cr3(v);
 
     return 0;
 }
@@ -669,7 +670,6 @@ static void __context_switch(void)
             loaddebug(&n->arch.guest_context, 6);
             loaddebug(&n->arch.guest_context, 7);
         }
-
         n->arch.ctxt_switch_to(n);
     }
 
@@ -927,29 +927,34 @@ void domain_relinquish_resources(struct 
     /* Drop the in-use references to page-table bases. */
     for_each_vcpu ( d, v )
     {
-        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
-        {
-            if ( !shadow_mode_refcounts(d) )
-                put_page_type(mfn_to_page(pfn));
-            put_page(mfn_to_page(pfn));
-
+        /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling,
+         * or sh2_update_paging_modes()) */
+        pfn = pagetable_get_pfn(v->arch.guest_table);
+        if ( pfn != 0 )
+        {
+            if ( shadow2_mode_refcounts(d) )
+                put_page(mfn_to_page(pfn));
+            else
+                put_page_and_type(mfn_to_page(pfn));
             v->arch.guest_table = pagetable_null();
         }
 
-        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
-        {
-            if ( !shadow_mode_refcounts(d) )
-                put_page_type(mfn_to_page(pfn));
-            put_page(mfn_to_page(pfn));
-
+#ifdef __x86_64__
+        /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
+        pfn = pagetable_get_pfn(v->arch.guest_table_user);
+        if ( pfn != 0 )
+        {
+            put_page_and_type(mfn_to_page(pfn));
             v->arch.guest_table_user = pagetable_null();
         }
+#endif
     }
 
     if ( d->vcpu[0] && hvm_guest(d->vcpu[0]) )
         hvm_relinquish_guest_resources(d);
 
-    shadow_mode_disable(d);
+    /* Tear down shadow mode stuff. */
+    shadow2_teardown(d);
 
     /*
      * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
@@ -964,26 +969,23 @@ void domain_relinquish_resources(struct 
 
     /* Free page used by xen oprofile buffer */
     free_xenoprof_pages(d);
-
 }
 
 void arch_dump_domain_info(struct domain *d)
 {
-    if ( shadow_mode_enabled(d) )
-    {
-        printk("    shadow mode: ");
-        if ( shadow_mode_refcounts(d) )
+    if ( shadow2_mode_enabled(d) )
+    {
+        printk("    shadow2 mode: ");
+        if ( d->arch.shadow2_mode & SHM2_enable )
+            printk("enabled ");
+        if ( shadow2_mode_refcounts(d) )
             printk("refcounts ");
-        if ( shadow_mode_write_all(d) )
-            printk("write_all ");
-        if ( shadow_mode_log_dirty(d) )
+        if ( shadow2_mode_log_dirty(d) )
             printk("log_dirty ");
-        if ( shadow_mode_translate(d) )
+        if ( shadow2_mode_translate(d) )
             printk("translate ");
-        if ( shadow_mode_external(d) )
+        if ( shadow2_mode_external(d) )
             printk("external ");
-        if ( shadow_mode_wr_pt_pte(d) )
-            printk("wr_pt_pte ");
         printk("\n");
     }
 }
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/domain_build.c       Wed Aug 16 17:02:35 2006 +0100
@@ -683,8 +683,11 @@ int construct_dom0(struct domain *d,
     for ( i = 1; i < opt_dom0_max_vcpus; i++ )
         (void)alloc_vcpu(d, i, i);
 
-    /* Set up monitor table */
-    update_pagetables(v);
+    /* Set up CR3 value for write_ptbase */
+    if ( shadow2_mode_enabled(v->domain) )
+        shadow2_update_paging_modes(v);
+    else
+        update_cr3(v);
 
     /* Install the new page tables. */
     local_irq_disable();
@@ -796,10 +799,8 @@ int construct_dom0(struct domain *d,
     new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);
 
     if ( opt_dom0_shadow )
-    {
-        shadow_mode_enable(d, SHM_enable);
-        update_pagetables(v);
-    }
+        if ( shadow2_test_enable(d) == 0 ) 
+            shadow2_update_paging_modes(v);
 
     if ( supervisor_mode_kernel )
     {
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/hvm.c    Wed Aug 16 17:02:35 2006 +0100
@@ -30,6 +30,7 @@
 #include <xen/hypercall.h>
 #include <xen/guest_access.h>
 #include <xen/event.h>
+#include <xen/shadow.h>
 #include <asm/current.h>
 #include <asm/e820.h>
 #include <asm/io.h>
@@ -42,10 +43,6 @@
 #include <asm/spinlock.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
 #include <public/sched.h>
 #include <public/hvm/ioreq.h>
 #include <public/version.h>
@@ -61,7 +58,7 @@ static void hvm_zap_mmio_range(
 static void hvm_zap_mmio_range(
     struct domain *d, unsigned long pfn, unsigned long nr_pfn)
 {
-    unsigned long i, val = INVALID_MFN;
+    unsigned long i;
 
     ASSERT(d == current->domain);
 
@@ -70,7 +67,8 @@ static void hvm_zap_mmio_range(
         if ( pfn + i >= 0xfffff )
             break;
 
-        __copy_to_user(&phys_to_machine_mapping[pfn + i], &val, sizeof (val));
+        if ( VALID_MFN(gmfn_to_mfn(d, pfn + i)) )
+            guest_remove_page(d, pfn + i);
     }
 }
 
@@ -262,11 +260,13 @@ void hvm_setup_platform(struct domain* d
     if ( !hvm_guest(v) || (v->vcpu_id != 0) )
         return;
 
+#if 0 /* SHADOW2 does not have this */
     if ( shadow_direct_map_init(d) == 0 )
     {
         printk("Can not allocate shadow direct map for HVM domain.\n");
         domain_crash_synchronous();
     }
+#endif
 
     hvm_zap_iommu_pages(d);
 
@@ -380,6 +380,8 @@ void hvm_hlt(unsigned long rflags)
  */
 int hvm_copy(void *buf, unsigned long vaddr, int size, int dir)
 {
+    struct vcpu *v = current;
+    unsigned long gfn;
     unsigned long mfn;
     char *addr;
     int count;
@@ -389,10 +391,9 @@ int hvm_copy(void *buf, unsigned long va
         if (count > size)
             count = size;
 
-        if (hvm_paging_enabled(current))
-            mfn = gva_to_mfn(vaddr);
-        else
-            mfn = get_mfn_from_gpfn(vaddr >> PAGE_SHIFT);
+        gfn = shadow2_gva_to_gfn(v, vaddr);
+        mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn));
+
         if (mfn == INVALID_MFN)
             return 0;
 
@@ -545,7 +546,7 @@ void hvm_do_hypercall(struct cpu_user_re
         return;
     }
 
-    if ( current->domain->arch.ops->guest_paging_levels == PAGING_L4 )
+    if ( current->arch.shadow2->guest_levels == 4 )
     {
         pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi,
                                                        pregs->rsi,
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/platform.c
--- a/xen/arch/x86/hvm/platform.c       Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/platform.c       Wed Aug 16 17:02:35 2006 +0100
@@ -21,7 +21,7 @@
 #include <xen/config.h>
 #include <xen/types.h>
 #include <xen/mm.h>
-#include <asm/shadow.h>
+#include <xen/shadow.h>
 #include <xen/domain_page.h>
 #include <asm/page.h>
 #include <xen/event.h>
@@ -35,9 +35,6 @@
 #include <xen/lib.h>
 #include <xen/sched.h>
 #include <asm/current.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
 
 #define DECODE_success  1
 #define DECODE_failure  0
@@ -724,7 +721,7 @@ void send_pio_req(struct cpu_user_regs *
 
     if (pvalid) {
         if (hvm_paging_enabled(current))
-            p->u.pdata = (void *) gva_to_gpa(value);
+            p->u.data = shadow2_gva_to_gpa(current, value);
         else
             p->u.pdata = (void *) value; /* guest VA == guest PA */
     } else
@@ -774,7 +771,7 @@ void send_mmio_req(
 
     if (pvalid) {
         if (hvm_paging_enabled(v))
-            p->u.pdata = (void *) gva_to_gpa(value);
+            p->u.data = shadow2_gva_to_gpa(v, value);
         else
             p->u.pdata = (void *) value; /* guest VA == guest PA */
     } else
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c        Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/svm/svm.c        Wed Aug 16 17:02:35 2006 +0100
@@ -26,9 +26,10 @@
 #include <xen/irq.h>
 #include <xen/softirq.h>
 #include <xen/hypercall.h>
+#include <xen/domain_page.h>
 #include <asm/current.h>
 #include <asm/io.h>
-#include <asm/shadow.h>
+#include <asm/shadow2.h>
 #include <asm/regs.h>
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
@@ -43,10 +44,6 @@
 #include <asm/hvm/svm/emulate.h>
 #include <asm/hvm/svm/vmmcall.h>
 #include <asm/hvm/svm/intr.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
 #include <public/sched.h>
 
 #define SVM_EXTRA_DEBUG
@@ -414,7 +411,7 @@ static int svm_realmode(struct vcpu *v)
     return (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE);
 }
 
-static int svm_instruction_length(struct vcpu *v)
+int svm_guest_x86_mode(struct vcpu *v)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode;
@@ -423,10 +420,20 @@ static int svm_instruction_length(struct
         mode = vmcb->cs.attributes.fields.l ? 8 : 4;
     else
         mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
-    return svm_instrlen(guest_cpu_user_regs(), mode);
-}
-
-static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
+    return mode;
+}
+
+int svm_instruction_length(struct vcpu *v)
+{
+    return svm_instrlen(guest_cpu_user_regs(), svm_guest_x86_mode(v));
+}
+
+void svm_update_host_cr3(struct vcpu *v)
+{
+    /* SVM doesn't have a HOST_CR3 equivalent to update. */
+}
+
+unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
 {
     switch ( num )
     {
@@ -436,6 +443,8 @@ static unsigned long svm_get_ctrl_reg(st
         return v->arch.hvm_svm.cpu_cr2;
     case 3:
         return v->arch.hvm_svm.cpu_cr3;
+    case 4:
+        return v->arch.hvm_svm.cpu_shadow_cr4;
     default:
         BUG();
     }
@@ -524,8 +533,6 @@ static void svm_init_hypercall_page(stru
     /* Don't support HYPERVISOR_iret at the moment */
     *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
 }
-
-
 
 
 int svm_dbg_on = 0;
@@ -647,6 +654,11 @@ static void svm_load_cpu_guest_regs(
     svm_load_cpu_user_regs(v, regs);
 }
 
+int svm_long_mode_enabled(struct vcpu *v)
+{
+    return SVM_LONG_GUEST(v);
+}
+
 
 
 static void arch_svm_do_launch(struct vcpu *v) 
@@ -726,7 +738,6 @@ static void svm_final_setup_guest(struct
 static void svm_final_setup_guest(struct vcpu *v)
 {
     struct domain *d = v->domain;
-    struct vcpu *vc;
 
     v->arch.schedule_tail    = arch_svm_do_launch;
     v->arch.ctxt_switch_from = svm_ctxt_switch_from;
@@ -735,9 +746,12 @@ static void svm_final_setup_guest(struct
     if ( v != d->vcpu[0] )
         return;
 
-    /* Initialize monitor page table */
-    for_each_vcpu( d, vc )
-        vc->arch.monitor_table = pagetable_null();
+    if ( !shadow2_mode_external(d) )
+    {
+        DPRINTK("Can't init HVM for dom %u vcpu %u: "
+                "not in shadow2 external mode\n", d->domain_id, v->vcpu_id);
+        domain_crash(d);
+    }
 
     /* 
      * Required to do this once per domain
@@ -745,13 +759,6 @@ static void svm_final_setup_guest(struct
      */
     memset(&d->shared_info->evtchn_mask[0], 0xff, 
            sizeof(d->shared_info->evtchn_mask));       
-
-    /* 
-     * Put the domain in shadow mode even though we're going to be using
-     * the shared 1:1 page table initially. It shouldn't hurt 
-     */
-    shadow_mode_enable(d, SHM_enable|SHM_refcounts|
-                       SHM_translate|SHM_external|SHM_wr_pt_pte);
 }
 
 
@@ -809,9 +816,13 @@ int start_svm(void)
 
     hvm_funcs.realmode = svm_realmode;
     hvm_funcs.paging_enabled = svm_paging_enabled;
+    hvm_funcs.long_mode_enabled = svm_long_mode_enabled;
+    hvm_funcs.guest_x86_mode = svm_guest_x86_mode;
     hvm_funcs.instruction_length = svm_instruction_length;
     hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg;
 
+    hvm_funcs.update_host_cr3 = svm_update_host_cr3;
+    
     hvm_funcs.stts = svm_stts;
     hvm_funcs.set_tsc_offset = svm_set_tsc_offset;
 
@@ -834,7 +845,6 @@ static void svm_relinquish_guest_resourc
             continue;
 
         destroy_vmcb(&v->arch.hvm_svm);
-        free_monitor_pagetable(v);
         kill_timer(&v->arch.hvm_vcpu.hlt_timer);
         if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) ) 
         {
@@ -851,8 +861,6 @@ static void svm_relinquish_guest_resourc
 
     if ( d->arch.hvm_domain.buffered_io_va )
         unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
-
-    shadow_direct_map_clean(d);
 }
 
 
@@ -894,7 +902,6 @@ static int svm_do_page_fault(unsigned lo
 {
     struct vcpu *v = current;
     unsigned long eip;
-    unsigned long gpa; /* FIXME: PAE */
     int result;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
@@ -907,43 +914,7 @@ static int svm_do_page_fault(unsigned lo
             va, eip, (unsigned long)regs->error_code);
 //#endif
 
-    if ( !svm_paging_enabled(v) )
-    {
-        if ( shadow_direct_map_fault(va, regs) ) 
-            return 1;
-
-        handle_mmio(va, va);
-        return 1;
-    }
-
-
-    gpa = gva_to_gpa(va);
-
-    /* Use 1:1 page table to identify MMIO address space */
-    if (mmio_space(gpa))
-    {
-        /* No support for APIC */
-        if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000)
-        { 
-            int inst_len;
-            inst_len = svm_instruction_length(v);
-            if (inst_len == -1)
-            {
-                printf("%s: INST_LEN - Unable to decode properly\n", __func__);
-                domain_crash_synchronous();
-            }
-
-            __update_guest_eip(vmcb, inst_len);
-
-            return 1;
-        }
-
-        handle_mmio(va, gpa);
-
-        return 1;
-    }
-    
-    result = shadow_fault(va, regs);
+    result = shadow2_fault(va, regs); 
 
     if( result ) {
         /* Let's make sure that the Guest TLB is flushed */
@@ -1035,19 +1006,12 @@ static void svm_vmexit_do_cpuid(struct v
             clear_bit(X86_FEATURE_APIC, &edx);
         }
 
-#if CONFIG_PAGING_LEVELS < 3
-        clear_bit(X86_FEATURE_PAE, &edx);
-        clear_bit(X86_FEATURE_PSE, &edx);
+#if CONFIG_PAGING_LEVELS >= 3
+        if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
+#endif
+            clear_bit(X86_FEATURE_PAE, &edx);
         clear_bit(X86_FEATURE_PSE36, &edx);
-#else
-        if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
-        {
-            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
-                clear_bit(X86_FEATURE_PAE, &edx);
-            clear_bit(X86_FEATURE_PSE, &edx);
-            clear_bit(X86_FEATURE_PSE36, &edx);
-        }
-#endif
+
         /* Clear out reserved bits. */
         ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED;
         edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED;
@@ -1097,23 +1061,12 @@ static void svm_vmexit_do_cpuid(struct v
         clear_bit(X86_FEATURE_SYSCALL & 31, &edx);
 #endif
 
-#if CONFIG_PAGING_LEVELS < 3
-        clear_bit(X86_FEATURE_NX & 31, &edx);
-        clear_bit(X86_FEATURE_PAE, &edx);
-        clear_bit(X86_FEATURE_PSE, &edx);
+
+#if CONFIG_PAGING_LEVELS >= 3
+        if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
+#endif
+            clear_bit(X86_FEATURE_PAE, &edx);
         clear_bit(X86_FEATURE_PSE36, &edx);
-#else
-        if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
-        {
-            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
-            {
-                clear_bit(X86_FEATURE_NX & 31, &edx);
-                clear_bit(X86_FEATURE_PAE, &edx);
-            }
-            clear_bit(X86_FEATURE_PSE, &edx);
-            clear_bit(X86_FEATURE_PSE36, &edx);
-        }
-#endif
 
         /* Make SVM feature invisible to the guest. */
         clear_bit(X86_FEATURE_SVME & 31, &ecx);
@@ -1555,6 +1508,7 @@ static int svm_set_cr0(unsigned long val
     unsigned long mfn;
     int paging_enabled;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    unsigned long old_base_mfn;
   
     ASSERT(vmcb);
 
@@ -1600,54 +1554,21 @@ static int svm_set_cr0(unsigned long val
             set_bit(SVM_CPU_STATE_LMA_ENABLED,
                     &v->arch.hvm_svm.cpu_state);
             vmcb->efer |= (EFER_LMA | EFER_LME);
-            if (!shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
-            {
-                printk("Unsupported guest paging levels\n");
-                domain_crash_synchronous(); /* need to take a clean path */
-            }
-        }
-        else
+        }
 #endif  /* __x86_64__ */
-        {
-#if CONFIG_PAGING_LEVELS >= 3
-            /* seems it's a 32-bit or 32-bit PAE guest */
-            if ( test_bit(SVM_CPU_STATE_PAE_ENABLED,
-                        &v->arch.hvm_svm.cpu_state) )
-            {
-                /* The guest enables PAE first and then it enables PG, it is
-                 * really a PAE guest */
-                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous();
-                }
-            }
-            else
-            {
-                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous(); /* need to take a clean path */
-                }
-            }
-#endif
-        }
 
         /* Now arch.guest_table points to machine physical. */
+        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
         v->arch.guest_table = pagetable_from_pfn(mfn);
-        update_pagetables(v);
+        if ( old_base_mfn )
+            put_page(mfn_to_page(old_base_mfn));
+        shadow2_update_paging_modes(v);
 
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", 
                 (unsigned long) (mfn << PAGE_SHIFT));
 
+        vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
         set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-        vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
-
-        /* arch->shadow_table should hold the next CR3 for shadow */
-        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx\n", 
-                    v->arch.hvm_svm.cpu_cr3, mfn);
-
-        return 1;
     }
 
     if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
@@ -1667,17 +1588,16 @@ static int svm_set_cr0(unsigned long val
             svm_inject_exception(v, TRAP_gp_fault, 1, 0);
             return 0;
         }
-
-        clear_all_shadow_status( v->domain );
+        shadow2_update_paging_modes(v);
+        vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
         set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-        vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
     }
     else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
     {
         /* we should take care of this kind of situation */
-        clear_all_shadow_status(v->domain);
+        shadow2_update_paging_modes(v);
+        vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
         set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-        vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
     }
 
     return 1;
@@ -1786,7 +1706,7 @@ static int mov_to_cr(int gpreg, int cr, 
             mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
             if (mfn != pagetable_get_pfn(v->arch.guest_table))
                 __hvm_bug(regs);
-            shadow_sync_all(v->domain);
+            shadow2_update_cr3(v);
         }
         else 
         {
@@ -1812,14 +1732,10 @@ static int mov_to_cr(int gpreg, int cr, 
             /*
              * arch.shadow_table should now hold the next CR3 for shadow
              */
-#if CONFIG_PAGING_LEVELS >= 3
-            if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
-                shadow_sync_all(v->domain);
-#endif
             v->arch.hvm_svm.cpu_cr3 = value;
-            update_pagetables(v);
+            update_cr3(v);
+            vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
             HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
-            vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
         }
         break;
     }
@@ -1838,12 +1754,6 @@ static int mov_to_cr(int gpreg, int cr, 
                 /* The guest is a 32-bit PAE guest. */
 #if CONFIG_PAGING_LEVELS >= 3
                 unsigned long mfn, old_base_mfn;
-
-                if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous(); /* need to take a clean path */
-                }
 
                 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
                                     v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT)) ||
@@ -1853,21 +1763,20 @@ static int mov_to_cr(int gpreg, int cr, 
                     domain_crash_synchronous(); /* need to take a clean path */
                 }
 
-                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-                if ( old_base_mfn )
-                    put_page(mfn_to_page(old_base_mfn));
-
                 /*
                  * Now arch.guest_table points to machine physical.
                  */
 
+                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
                 v->arch.guest_table = pagetable_from_pfn(mfn);
-                update_pagetables(v);
+                if ( old_base_mfn )
+                    put_page(mfn_to_page(old_base_mfn));
+                shadow2_update_paging_modes(v);
 
                 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
                             (unsigned long) (mfn << PAGE_SHIFT));
 
-                vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
+                vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
 
                 /*
                  * arch->shadow_table should hold the next CR3 for shadow
@@ -1876,33 +1785,6 @@ static int mov_to_cr(int gpreg, int cr, 
                 HVM_DBG_LOG(DBG_LEVEL_VMMU, 
                             "Update CR3 value = %lx, mfn = %lx",
                             v->arch.hvm_svm.cpu_cr3, mfn);
-#endif
-            }
-            else
-            {
-                /*  The guest is a 64 bit or 32-bit PAE guest. */
-#if CONFIG_PAGING_LEVELS >= 3
-                if ( (v->domain->arch.ops != NULL) &&
-                        v->domain->arch.ops->guest_paging_levels == PAGING_L2)
-                {
-                    /* Seems the guest first enables PAE without enabling PG,
-                     * it must enable PG after that, and it is a 32-bit PAE
-                     * guest */
-
-                    if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3))
-                    {
-                        printk("Unsupported guest paging levels\n");
-                        domain_crash_synchronous();
-                    }                   
-                }
-                else
-                {
-                    if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4))
-                    {
-                        printk("Unsupported guest paging levels\n");
-                        domain_crash_synchronous();
-                    }
-                }
 #endif
             }
         }
@@ -1926,7 +1808,7 @@ static int mov_to_cr(int gpreg, int cr, 
         if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
         {
             set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-            shadow_sync_all(v->domain);
+            shadow2_update_paging_modes(v);
         }
         break;
     }
@@ -2267,7 +2149,7 @@ void svm_handle_invlpg(const short invlp
 
     /* Overkill, we may not this */
     set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-    shadow_invlpg(v, g_vaddr);
+    shadow2_invlpg(v, g_vaddr);
 }
 
 
@@ -2638,7 +2520,7 @@ void walk_shadow_and_guest_pt(unsigned l
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     unsigned long gpa;
 
-    gpa = gva_to_gpa( gva );
+    gpa = shadow2_gva_to_gpa(current, gva);
     printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 );
     if( !svm_paging_enabled(v) || mmio_space(gpa) )
        return;
@@ -2662,8 +2544,12 @@ void walk_shadow_and_guest_pt(unsigned l
     __copy_from_user(&gpte, &linear_pg_table[ l1_linear_offset(gva) ],
                      sizeof(gpte) );
     printk( "G-PTE = %x, flags=%x\n", gpte.l1, l1e_get_flags(gpte) );
-    __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ],
+
+    BUG(); // need to think about this, and convert usage of
+           // phys_to_machine_mapping to use pagetable format...
+    __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ], 
                       sizeof(spte) );
+
     printk( "S-PTE = %x, flags=%x\n", spte.l1, l1e_get_flags(spte));
 }
 #endif /* SVM_WALK_GUEST_PAGES */
@@ -2704,7 +2590,8 @@ asmlinkage void svm_vmexit_handler(struc
 
     if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF) 
     {
-        if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
+        if (svm_paging_enabled(v) && 
+            !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2)))
         {
             printk("I%08ld,ExC=%s(%d),IP=%x:%llx,I1=%llx,I2=%llx,INT=%llx, "
                    "gpa=%llx\n", intercepts_counter,
@@ -2713,7 +2600,7 @@ asmlinkage void svm_vmexit_handler(struc
                    (unsigned long long) vmcb->exitinfo1,
                    (unsigned long long) vmcb->exitinfo2,
                    (unsigned long long) vmcb->exitintinfo.bytes,
-            (unsigned long long) gva_to_gpa( vmcb->exitinfo2 ) );
+            (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2));
         }
         else 
         {
@@ -2757,7 +2644,7 @@ asmlinkage void svm_vmexit_handler(struc
         && ( ( vmcb->exitinfo2 == vmcb->rip )
         || vmcb->exitintinfo.bytes) )
     {
-       if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
+       if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
            walk_shadow_and_guest_pt( vmcb->exitinfo2 );
     }
 #endif
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c       Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/svm/vmcb.c       Wed Aug 16 17:02:35 2006 +0100
@@ -380,8 +380,8 @@ void svm_do_launch(struct vcpu *v)
         printk("%s: phys_table   = %lx\n", __func__, pt);
     }
 
-    /* At launch we always use the phys_table */
-    vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
+    /* Set cr3 from hw_cr3 even when guest-visible paging is not enabled */
+    vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
 
     if (svm_dbg_on) 
     {
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/vlapic.c
--- a/xen/arch/x86/hvm/vlapic.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/vlapic.c Wed Aug 16 17:02:35 2006 +0100
@@ -21,7 +21,8 @@
 #include <xen/types.h>
 #include <xen/mm.h>
 #include <xen/xmalloc.h>
-#include <asm/shadow.h>
+#include <xen/shadow.h>
+#include <xen/domain_page.h>
 #include <asm/page.h>
 #include <xen/event.h>
 #include <xen/trace.h>
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c       Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/vmx/vmcs.c       Wed Aug 16 17:02:35 2006 +0100
@@ -34,12 +34,8 @@
 #include <asm/flushtlb.h>
 #include <xen/event.h>
 #include <xen/kernel.h>
-#include <asm/shadow.h>
 #include <xen/keyhandler.h>
-
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
+#include <asm/shadow2.h>
 
 static int vmcs_size;
 static int vmcs_order;
@@ -238,7 +234,7 @@ static void vmx_set_host_env(struct vcpu
 
 static void vmx_do_launch(struct vcpu *v)
 {
-/* Update CR3, GDT, LDT, TR */
+/* Update CR3, CR0, CR4, GDT, LDT, TR */
     unsigned int  error = 0;
     unsigned long cr0, cr4;
 
@@ -276,8 +272,11 @@ static void vmx_do_launch(struct vcpu *v
     error |= __vmwrite(GUEST_TR_BASE, 0);
     error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
 
-    __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
-    __vmwrite(HOST_CR3, pagetable_get_paddr(v->arch.monitor_table));
+    shadow2_update_paging_modes(v);
+    printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n",
+           __func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3);
+    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+    __vmwrite(HOST_CR3, v->arch.cr3);
 
     v->arch.schedule_tail = arch_vmx_do_resume;
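
With this changeset both the SVM and VMX launch paths load the hardware CR3 values from per-vcpu fields (hvm_vcpu.hw_cr3 for the guest side, arch.cr3 for the host side) that the shadow2 code keeps current, instead of deriving them from phys_table or monitor_table. A rough sketch of the pattern, with invented type names:

    /* Sketch only: the shadow code owns the CR3 values; the vendor-specific
     * launch code just copies them, paged or not. */
    struct hvm_vcpu_sk {
        unsigned long hw_cr3;   /* value the guest's hardware CR3 should hold */
    };

    struct vcpu_cr3_sk {
        struct hvm_vcpu_sk hvm;
        unsigned long cr3;      /* hypervisor CR3 to use while running this vcpu */
    };

    static void sk_load_cr3_fields(const struct vcpu_cr3_sk *v,
                                   unsigned long *guest_cr3,
                                   unsigned long *host_cr3)
    {
        *guest_cr3 = v->hvm.hw_cr3; /* valid whether or not guest paging is on */
        *host_cr3  = v->cr3;
    }
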
 
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Wed Aug 16 17:02:35 2006 +0100
@@ -26,9 +26,9 @@
 #include <xen/softirq.h>
 #include <xen/domain_page.h>
 #include <xen/hypercall.h>
+#include <xen/perfc.h>
 #include <asm/current.h>
 #include <asm/io.h>
-#include <asm/shadow.h>
 #include <asm/regs.h>
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
@@ -40,10 +40,7 @@
 #include <asm/hvm/vmx/vmx.h>
 #include <asm/hvm/vmx/vmcs.h>
 #include <asm/hvm/vmx/cpu.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
+#include <asm/shadow2.h>
 #include <public/sched.h>
 #include <public/hvm/ioreq.h>
 #include <asm/hvm/vpic.h>
@@ -69,11 +66,16 @@ static int vmx_initialize_guest_resource
     if ( v->vcpu_id != 0 )
         return 1;
 
+    if ( !shadow2_mode_external(d) )
+    {
+        DPRINTK("Can't init HVM for dom %u vcpu %u: "
+                "not in shadow2 external mode\n", 
+                d->domain_id, v->vcpu_id);
+        domain_crash(d);
+    }
+
     for_each_vcpu ( d, vc )
     {
-        /* Initialize monitor page table */
-        vc->arch.monitor_table = pagetable_null();
-
         memset(&vc->arch.hvm_vmx, 0, sizeof(struct arch_vmx_struct));
 
         if ( (rc = vmx_create_vmcs(vc)) != 0 )
@@ -107,6 +109,7 @@ static int vmx_initialize_guest_resource
 
         vc->arch.hvm_vmx.io_bitmap_a = io_bitmap_a;
         vc->arch.hvm_vmx.io_bitmap_b = io_bitmap_b;
+
     }
 
     /*
@@ -116,11 +119,6 @@ static int vmx_initialize_guest_resource
     memset(&d->shared_info->evtchn_mask[0], 0xff,
            sizeof(d->shared_info->evtchn_mask));
 
-    /* Put the domain in shadow mode even though we're going to be using
-     * the shared 1:1 page table initially. It shouldn't hurt */
-    shadow_mode_enable(
-        d, SHM_enable|SHM_refcounts|SHM_translate|SHM_external|SHM_wr_pt_pte);
-
     return 1;
 }
 
@@ -133,7 +131,6 @@ static void vmx_relinquish_guest_resourc
         vmx_destroy_vmcs(v);
         if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
             continue;
-        free_monitor_pagetable(v);
         kill_timer(&v->arch.hvm_vcpu.hlt_timer);
         if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
         {
@@ -153,8 +150,6 @@ static void vmx_relinquish_guest_resourc
 
     if ( d->arch.hvm_domain.buffered_io_va )
         unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
-
-    shadow_direct_map_clean(d);
 }
 
 #ifdef __x86_64__
@@ -595,14 +590,6 @@ static void vmx_load_cpu_guest_regs(stru
     vmx_vmcs_exit(v);
 }
 
-static int vmx_realmode(struct vcpu *v)
-{
-    unsigned long rflags;
-
-    __vmread(GUEST_RFLAGS, &rflags);
-    return rflags & X86_EFLAGS_VM;
-}
-
 static int vmx_instruction_length(struct vcpu *v)
 {
     unsigned long inst_len;
@@ -622,6 +609,8 @@ static unsigned long vmx_get_ctrl_reg(st
         return v->arch.hvm_vmx.cpu_cr2;
     case 3:
         return v->arch.hvm_vmx.cpu_cr3;
+    case 4:
+        return v->arch.hvm_vmx.cpu_shadow_cr4;
     default:
         BUG();
     }
@@ -753,8 +742,12 @@ static void vmx_setup_hvm_funcs(void)
 
     hvm_funcs.realmode = vmx_realmode;
     hvm_funcs.paging_enabled = vmx_paging_enabled;
+    hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
+    hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
     hvm_funcs.instruction_length = vmx_instruction_length;
     hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
+
+    hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
 
     hvm_funcs.stts = vmx_stts;
     hvm_funcs.set_tsc_offset = vmx_set_tsc_offset;
@@ -855,53 +848,25 @@ static void inline __update_guest_eip(un
     __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
 }
 
-
 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
 {
-    unsigned long gpa; /* FIXME: PAE */
     int result;
 
 #if 0 /* keep for debugging */
     {
-        unsigned long eip;
-
+        unsigned long eip, cs;
+
+        __vmread(GUEST_CS_BASE, &cs);
         __vmread(GUEST_RIP, &eip);
         HVM_DBG_LOG(DBG_LEVEL_VMMU,
-                    "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
-                    va, eip, (unsigned long)regs->error_code);
+                    "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
+                    "eip = %lx, error_code = %lx\n",
+                    va, cs, eip, (unsigned long)regs->error_code);
     }
 #endif
 
-    if ( !vmx_paging_enabled(current) )
-    {
-        /* construct 1-to-1 direct mapping */
-        if ( shadow_direct_map_fault(va, regs) ) 
-            return 1;
-
-        handle_mmio(va, va);
-        TRACE_VMEXIT (2,2);
-        return 1;
-    }
-    gpa = gva_to_gpa(va);
-
-    /* Use 1:1 page table to identify MMIO address space */
-    if ( mmio_space(gpa) ){
-        struct vcpu *v = current;
-        /* No support for APIC */
-        if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) { 
-            u32 inst_len;
-            __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len));
-            __update_guest_eip(inst_len);
-            return 1;
-        }
-        TRACE_VMEXIT (2,2);
-        /* in the case of MMIO, we are more interested in gpa than in va */
-        TRACE_VMEXIT (4,gpa);
-        handle_mmio(va, gpa);
-        return 1;
-    }
-
-    result = shadow_fault(va, regs);
+    result = shadow2_fault(va, regs);
+
     TRACE_VMEXIT (2,result);
 #if 0
     if ( !result )
@@ -972,23 +937,11 @@ static void vmx_vmexit_do_cpuid(struct c
                 clear_bit(X86_FEATURE_APIC, &edx);
             }
     
-#if CONFIG_PAGING_LEVELS < 3
-            edx &= ~(bitmaskof(X86_FEATURE_PAE)  |
-                     bitmaskof(X86_FEATURE_PSE)  |
-                     bitmaskof(X86_FEATURE_PSE36));
-#else
-            if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
-            {
-                if ( v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
-                    clear_bit(X86_FEATURE_PSE36, &edx);
-                else
-                {
-                    clear_bit(X86_FEATURE_PAE, &edx);
-                    clear_bit(X86_FEATURE_PSE, &edx);
-                    clear_bit(X86_FEATURE_PSE36, &edx);
-                }
-            }
+#if CONFIG_PAGING_LEVELS >= 3
+            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
 #endif
+                clear_bit(X86_FEATURE_PAE, &edx);
+            clear_bit(X86_FEATURE_PSE36, &edx);
 
             ebx &= NUM_THREADS_RESET_MASK;  
 
@@ -1086,7 +1039,7 @@ static void vmx_vmexit_do_invlpg(unsigne
      * We do the safest things first, then try to update the shadow
      * copying from guest
      */
-    shadow_invlpg(v, va);
+    shadow2_invlpg(v, va);
 }
 
 
@@ -1307,11 +1260,8 @@ vmx_world_restore(struct vcpu *v, struct
 
     error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
 
-    if (!vmx_paging_enabled(v)) {
-        HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
-        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+    if (!vmx_paging_enabled(v))
         goto skip_cr3;
-    }
 
     if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
         /*
@@ -1325,7 +1275,6 @@ vmx_world_restore(struct vcpu *v, struct
             domain_crash_synchronous();
             return 0;
         }
-        shadow_sync_all(v->domain);
     } else {
         /*
          * If different, make a shadow. Check if the PDBR is valid
@@ -1348,12 +1297,16 @@ vmx_world_restore(struct vcpu *v, struct
          * arch.shadow_table should now hold the next CR3 for shadow
          */
         v->arch.hvm_vmx.cpu_cr3 = c->cr3;
-        update_pagetables(v);
+    }
+
+ skip_cr3:
+
+    shadow2_update_paging_modes(v);
+    if (!vmx_paging_enabled(v))
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
+    else
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
-        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
-    }
-
- skip_cr3:
+    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
 
     error |= __vmread(CR4_READ_SHADOW, &old_cr4);
     error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
@@ -1485,6 +1438,7 @@ static int vmx_set_cr0(unsigned long val
     int paging_enabled;
     unsigned long vm_entry_value;
     unsigned long old_cr0;
+    unsigned long old_base_mfn;
 
     /*
      * CR0: We don't want to lose PE and PG.
@@ -1514,7 +1468,8 @@ static int vmx_set_cr0(unsigned long val
             v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
              !get_page(mfn_to_page(mfn), v->domain) )
         {
-            printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
+            printk("Invalid CR3 value = %lx (mfn=%lx)\n", 
+                   v->arch.hvm_vmx.cpu_cr3, mfn);
             domain_crash_synchronous(); /* need to take a clean path */
         }
 
@@ -1539,51 +1494,22 @@ static int vmx_set_cr0(unsigned long val
             __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
             vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
             __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
-
-            if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
-            {
-                printk("Unsupported guest paging levels\n");
-                domain_crash_synchronous(); /* need to take a clean path */
-            }
-        }
-        else
-#endif  /* __x86_64__ */
-        {
-#if CONFIG_PAGING_LEVELS >= 3
-            /* seems it's a 32-bit or 32-bit PAE guest */
-
-            if ( test_bit(VMX_CPU_STATE_PAE_ENABLED,
-                        &v->arch.hvm_vmx.cpu_state) )
-            {
-                /* The guest enables PAE first and then it enables PG, it is
-                 * really a PAE guest */
-                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous();
-                }
-            }
-            else
-            {
-                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous(); /* need to take a clean path */
-                }
-            }
+        }
 #endif
-        }
 
         /*
          * Now arch.guest_table points to machine physical.
          */
+        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
         v->arch.guest_table = pagetable_from_pfn(mfn);
-        update_pagetables(v);
+        if (old_base_mfn)
+            put_page(mfn_to_page(old_base_mfn));
+        shadow2_update_paging_modes(v);
 
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
                     (unsigned long) (mfn << PAGE_SHIFT));
 
-        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+        __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
         /*
          * arch->shadow_table should hold the next CR3 for shadow
          */
@@ -1625,7 +1551,6 @@ static int vmx_set_cr0(unsigned long val
             }
         }
 
-        clear_all_shadow_status(v->domain);
         if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) {
             set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state);
             __vmread(GUEST_RIP, &eip);
@@ -1651,9 +1576,8 @@ static int vmx_set_cr0(unsigned long val
     }
     else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
     {
-        /* we should take care of this kind of situation */
-        clear_all_shadow_status(v->domain);
-        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+        __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+        shadow2_update_paging_modes(v);
     }
 
     return 1;
@@ -1738,7 +1662,7 @@ static int mov_to_cr(int gp, int cr, str
             mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
             if (mfn != pagetable_get_pfn(v->arch.guest_table))
                 __hvm_bug(regs);
-            shadow_sync_all(v->domain);
+            shadow2_update_cr3(v);
         } else {
             /*
              * If different, make a shadow. Check if the PDBR is valid
@@ -1759,16 +1683,11 @@ static int mov_to_cr(int gp, int cr, str
             /*
              * arch.shadow_table should now hold the next CR3 for shadow
              */
-#if CONFIG_PAGING_LEVELS >= 3
-            if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
-                shadow_sync_all(v->domain);
-#endif
-
             v->arch.hvm_vmx.cpu_cr3 = value;
-            update_pagetables(v);
+            update_cr3(v);
             HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
                         value);
-            __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+            __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
         }
         break;
     }
@@ -1785,12 +1704,6 @@ static int mov_to_cr(int gp, int cr, str
                 /* The guest is a 32-bit PAE guest. */
 #if CONFIG_PAGING_LEVELS >= 3
                 unsigned long mfn, old_base_mfn;
-
-                if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous(); /* need to take a clean path */
-                }
 
                 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
                                     v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
@@ -1800,21 +1713,20 @@ static int mov_to_cr(int gp, int cr, str
                     domain_crash_synchronous(); /* need to take a clean path */
                 }
 
-                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-                if ( old_base_mfn )
-                    put_page(mfn_to_page(old_base_mfn));
 
                 /*
                  * Now arch.guest_table points to machine physical.
                  */
 
+                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
                 v->arch.guest_table = pagetable_from_pfn(mfn);
-                update_pagetables(v);
+                if ( old_base_mfn )
+                    put_page(mfn_to_page(old_base_mfn));
 
                 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
                             (unsigned long) (mfn << PAGE_SHIFT));
 
-                __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+                __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
 
                 /*
                  * arch->shadow_table should hold the next CR3 for shadow
@@ -1822,27 +1734,6 @@ static int mov_to_cr(int gp, int cr, str
 
                 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
                             v->arch.hvm_vmx.cpu_cr3, mfn);
-#endif
-            }
-            else
-            {
-                /*  The guest is a 64 bit or 32-bit PAE guest. */
-#if CONFIG_PAGING_LEVELS >= 3
-                if ( (v->domain->arch.ops != NULL) &&
-                        v->domain->arch.ops->guest_paging_levels == PAGING_L2)
-                {
-                    /* Seems the guest first enables PAE without enabling PG,
-                     * it must enable PG after that, and it is a 32-bit PAE
-                     * guest */
-
-                    if ( !shadow_set_guest_paging_levels(v->domain,
-                                                            PAGING_L3) )
-                    {
-                        printk("Unsupported guest paging levels\n");
-                        /* need to take a clean path */
-                        domain_crash_synchronous();
-                    }
-                }
 #endif
             }
         }
@@ -1864,8 +1755,7 @@ static int mov_to_cr(int gp, int cr, str
          * all TLB entries except global entries.
          */
         if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
-            shadow_sync_all(v->domain);
-
+            shadow2_update_paging_modes(v);
         break;
     }
     default:
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/mm.c Wed Aug 16 17:02:35 2006 +0100
@@ -137,7 +137,7 @@ static void free_l1_table(struct page_in
 
 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
                         unsigned long type);
-static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
+static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t, unsigned long gl1mfn);
 
 /* Used to defer flushing of memory structures. */
 struct percpu_mm_info {
@@ -274,9 +274,9 @@ void share_xen_page_with_privileged_gues
 #else
 /*
  * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
- * We cannot safely shadow the idle page table, nor shadow-mode page tables
+ * We cannot safely shadow the idle page table, nor shadow (v1) page tables
  * (detected by lack of an owning domain). As required for correctness, we
- * always shadow PDPTs aboive 4GB.
+ * always shadow PDPTs above 4GB.
  */
 #define l3tab_needs_shadow(mfn)                         \
     (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
@@ -297,17 +297,21 @@ static int __init cache_pae_fixmap_addre
 }
 __initcall(cache_pae_fixmap_address);
 
-static void __write_ptbase(unsigned long mfn)
+static DEFINE_PER_CPU(u32, make_cr3_timestamp);
+
+void make_cr3(struct vcpu *v, unsigned long mfn)
+/* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
+ * necessary, and sets v->arch.cr3 to the value to load in CR3. */
 {
     l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
-    struct pae_l3_cache *cache = &current->arch.pae_l3_cache;
+    struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
     unsigned int cpu = smp_processor_id();
 
-    /* Fast path 1: does this mfn need a shadow at all? */
+    /* Fast path: does this mfn need a shadow at all? */
     if ( !l3tab_needs_shadow(mfn) )
     {
-        write_cr3(mfn << PAGE_SHIFT);
-        /* Cache is no longer in use or valid (/after/ write to %cr3). */
+        v->arch.cr3 = mfn << PAGE_SHIFT;
+        /* Cache is no longer in use or valid */
         cache->high_mfn = 0;
         return;
     }
@@ -315,13 +319,6 @@ static void __write_ptbase(unsigned long
     /* Caching logic is not interrupt safe. */
     ASSERT(!in_irq());
 
-    /* Fast path 2: is this mfn already cached? */
-    if ( cache->high_mfn == mfn )
-    {
-        write_cr3(__pa(cache->table[cache->inuse_idx]));
-        return;
-    }
-
     /* Protects against pae_flush_pgd(). */
     spin_lock(&cache->lock);
 
@@ -330,29 +327,33 @@ static void __write_ptbase(unsigned long
 
     /* Map the guest L3 table and copy to the chosen low-memory cache. */
     *(fix_pae_highmem_pl1e - cpu) = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
+    /* First check the previous high mapping can't be in the TLB. 
+     * (i.e. have we loaded CR3 since we last did this?) */
+    if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
+        local_flush_tlb_one(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
     highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
     lowmem_l3tab  = cache->table[cache->inuse_idx];
     memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
     *(fix_pae_highmem_pl1e - cpu) = l1e_empty();
-
-    /* Install the low-memory L3 table in CR3. */
-    write_cr3(__pa(lowmem_l3tab));
+    this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
+
+    v->arch.cr3 = __pa(lowmem_l3tab);
 
     spin_unlock(&cache->lock);
 }
 
 #else /* !CONFIG_X86_PAE */
 
-static void __write_ptbase(unsigned long mfn)
-{
-    write_cr3(mfn << PAGE_SHIFT);
+void make_cr3(struct vcpu *v, unsigned long mfn)
+{
+    v->arch.cr3 = mfn << PAGE_SHIFT;
 }
 
 #endif /* !CONFIG_X86_PAE */
 
 void write_ptbase(struct vcpu *v)
 {
-    __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
+    write_cr3(v->arch.cr3);
 }
 
 void invalidate_shadow_ldt(struct vcpu *v)
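
The per-cpu make_cr3_timestamp introduced above lets make_cr3() skip an explicit flush of the per-cpu fixmap slot unless CR3 has not been reloaded since the slot was last used. The test reduces to something like this sketch (plain variables stand in for the this_cpu() counters):

    /* Sketch only: "may the old fixmap mapping still be in the TLB?" */
    static unsigned int sk_make_cr3_timestamp; /* last time the slot was used  */
    static unsigned int sk_tlbflush_time;      /* bumped on every CR3 reload   */

    static int sk_fixmap_may_be_stale(void)
    {
        /* Equal stamps mean no CR3 reload has happened in between, so the
         * previous mapping could still be cached and must be flushed
         * before the slot is reused. */
        return sk_make_cr3_timestamp == sk_tlbflush_time;
    }

    static void sk_note_fixmap_use(void)
    {
        sk_make_cr3_timestamp = sk_tlbflush_time;
    }
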
@@ -423,8 +424,6 @@ int map_ldt_shadow_page(unsigned int off
 
     BUG_ON(unlikely(in_irq()));
 
-    shadow_sync_va(v, gva);
-
     TOGGLE_MODE();
     __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
                      sizeof(l1e));
@@ -440,12 +439,12 @@ int map_ldt_shadow_page(unsigned int off
 
     res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
 
-    if ( !res && unlikely(shadow_mode_refcounts(d)) )
-    {
-        shadow_lock(d);
-        shadow_remove_all_write_access(d, gmfn, mfn);
+    if ( !res && unlikely(shadow2_mode_refcounts(d)) )
+    {
+        shadow2_lock(d);
+        shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
         res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
-        shadow_unlock(d);
+        shadow2_unlock(d);
     }
 
     if ( unlikely(!res) )
@@ -513,7 +512,7 @@ get_linear_pagetable(
     struct page_info *page;
     unsigned long pfn;
 
-    ASSERT( !shadow_mode_refcounts(d) );
+    ASSERT( !shadow2_mode_refcounts(d) );
 
     if ( (root_get_flags(re) & _PAGE_RW) )
     {
@@ -576,7 +575,8 @@ get_page_from_l1e(
 
         if ( !iomem_access_permitted(d, mfn, mfn) )
         {
-            MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
+            MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx", 
+                    d->domain_id, mfn);
             return 0;
         }
 
@@ -587,9 +587,14 @@ get_page_from_l1e(
         d = dom_io;
     }
 
-    okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
-            get_page_and_type(page, d, PGT_writable_page) :
-            get_page(page, d));
+    /* Foreign mappings into guests in shadow2 external mode don't
+     * contribute to writeable mapping refcounts.  (This allows the
+     * qemu-dm helper process in dom0 to map the domain's memory without
+     * messing up the count of "real" writable mappings.) */
+    okay = (((l1e_get_flags(l1e) & _PAGE_RW) && 
+             !(unlikely(shadow2_mode_external(d) && (d != current->domain))))
+            ? get_page_and_type(page, d, PGT_writable_page)
+            : get_page(page, d));
     if ( !okay )
     {
         MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
@@ -609,8 +614,6 @@ get_page_from_l2e(
     struct domain *d, unsigned long vaddr)
 {
     int rc;
-
-    ASSERT(!shadow_mode_refcounts(d));
 
     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
         return 1;
@@ -641,8 +644,6 @@ get_page_from_l3e(
 {
     int rc;
 
-    ASSERT(!shadow_mode_refcounts(d));
-
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
         return 1;
 
@@ -668,8 +669,6 @@ get_page_from_l4e(
     struct domain *d, unsigned long vaddr)
 {
     int rc;
-
-    ASSERT( !shadow_mode_refcounts(d) );
 
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return 1;
@@ -727,7 +726,10 @@ void put_page_from_l1e(l1_pgentry_t l1e,
         domain_crash(d);
     }
 
-    if ( l1e_get_flags(l1e) & _PAGE_RW )
+    /* Remember we didn't take a type-count of foreign writable mappings
+     * to shadow2 external domains */
+    if ( (l1e_get_flags(l1e) & _PAGE_RW) && 
+         !(unlikely((e != d) && shadow2_mode_external(e))) )
     {
         put_page_and_type(page);
     }
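
get_page_from_l1e() and put_page_from_l1e() must apply the same rule: a writable mapping of a shadow2-external domain's page taken by a different domain (for example qemu-dm in dom0) does not count against the page's writable type count. The shared rule, sketched with invented names:

    /* Sketch only: one predicate used symmetrically on map and unmap, so
     * the writable type count can neither leak nor go negative. */
    struct domain_sk;
    extern int sk_shadow2_external(struct domain_sk *d);

    static int sk_counts_as_writable(int pte_is_writable,
                                     struct domain_sk *page_owner,
                                     struct domain_sk *mapper)
    {
        if (!pte_is_writable)
            return 0;
        /* Foreign mappings into an external-mode (HVM) domain are exempt. */
        if (sk_shadow2_external(page_owner) && page_owner != mapper)
            return 0;
        return 1;
    }
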
@@ -784,7 +786,7 @@ static int alloc_l1_table(struct page_in
     l1_pgentry_t  *pl1e;
     int            i;
 
-    ASSERT(!shadow_mode_refcounts(d));
+    ASSERT(!shadow2_mode_refcounts(d));
 
     pl1e = map_domain_page(pfn);
 
@@ -832,6 +834,8 @@ static int create_pae_xen_mappings(l3_pg
      *  2. Cannot appear in another page table's L3:
      *     a. alloc_l3_table() calls this function and this check will fail
      *     b. mod_l3_entry() disallows updates to slot 3 in an existing table
+     *
+     * XXX -- this needs revisiting for shadow2_mode_refcount()==true...
      */
     page = l3e_get_page(l3e3);
     BUG_ON(page->u.inuse.type_info & PGT_pinned);
@@ -955,11 +959,7 @@ static int alloc_l2_table(struct page_in
     l2_pgentry_t  *pl2e;
     int            i;
 
-    /* See the code in shadow_promote() to understand why this is here. */
-    if ( (PGT_base_page_table == PGT_l2_page_table) &&
-         unlikely(shadow_mode_refcounts(d)) )
-        return 1;
-    ASSERT(!shadow_mode_refcounts(d));
+    ASSERT(!shadow2_mode_refcounts(d));
     
     pl2e = map_domain_page(pfn);
 
@@ -1009,11 +1009,7 @@ static int alloc_l3_table(struct page_in
     l3_pgentry_t  *pl3e;
     int            i;
 
-    /* See the code in shadow_promote() to understand why this is here. */
-    if ( (PGT_base_page_table == PGT_l3_page_table) &&
-         shadow_mode_refcounts(d) )
-        return 1;
-    ASSERT(!shadow_mode_refcounts(d));
+    ASSERT(!shadow2_mode_refcounts(d));
 
 #ifdef CONFIG_X86_PAE
     /*
@@ -1072,11 +1068,7 @@ static int alloc_l4_table(struct page_in
     unsigned long vaddr;
     int            i;
 
-    /* See the code in shadow_promote() to understand why this is here. */
-    if ( (PGT_base_page_table == PGT_l4_page_table) &&
-         shadow_mode_refcounts(d) )
-        return 1;
-    ASSERT(!shadow_mode_refcounts(d));
+    ASSERT(!shadow2_mode_refcounts(d));
 
     for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
     {
@@ -1183,51 +1175,61 @@ static void free_l4_table(struct page_in
 
 static inline int update_l1e(l1_pgentry_t *pl1e, 
                              l1_pgentry_t  ol1e, 
-                             l1_pgentry_t  nl1e)
-{
+                             l1_pgentry_t  nl1e,
+                             unsigned long gl1mfn,
+                             struct vcpu *v)
+{
+    int rv = 1;
+    if ( unlikely(shadow2_mode_enabled(v->domain)) )
+        shadow2_lock(v->domain);
 #ifndef PTE_UPDATE_WITH_CMPXCHG
-    return !__copy_to_user(pl1e, &nl1e, sizeof(nl1e));
+    rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e)));
 #else
-    intpte_t o = l1e_get_intpte(ol1e);
-    intpte_t n = l1e_get_intpte(nl1e);
-
-    for ( ; ; )
-    {
-        if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
-        {
-            MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
-                    ": saw %" PRIpte,
-                    l1e_get_intpte(ol1e),
-                    l1e_get_intpte(nl1e),
-                    o);
-            return 0;
-        }
-
-        if ( o == l1e_get_intpte(ol1e) )
-            break;
-
-        /* Allowed to change in Accessed/Dirty flags only. */
-        BUG_ON((o ^ l1e_get_intpte(ol1e)) &
-               ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
-        ol1e = l1e_from_intpte(o);
-    }
-
-    return 1;
+    {
+        intpte_t o = l1e_get_intpte(ol1e);
+        intpte_t n = l1e_get_intpte(nl1e);
+        
+        for ( ; ; )
+        {
+            if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
+            {
+                MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
+                        ": saw %" PRIpte,
+                        l1e_get_intpte(ol1e),
+                        l1e_get_intpte(nl1e),
+                        o);
+                rv = 0;
+                break;
+            }
+
+            if ( o == l1e_get_intpte(ol1e) )
+                break;
+
+            /* Allowed to change in Accessed/Dirty flags only. */
+            BUG_ON((o ^ l1e_get_intpte(ol1e)) &
+                   ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
+            ol1e = l1e_from_intpte(o);
+        }
+    }
 #endif
+    if ( unlikely(shadow2_mode_enabled(v->domain)) )
+    {
+        shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
+        shadow2_unlock(v->domain);    
+    }
+    return rv;
 }
 
 
 /* Update the L1 entry at pl1e to new value nl1e. */
-static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
+static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, 
+                        unsigned long gl1mfn)
 {
     l1_pgentry_t ol1e;
     struct domain *d = current->domain;
 
     if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
         return 0;
-
-    if ( unlikely(shadow_mode_refcounts(d)) )
-        return update_l1e(pl1e, ol1e, nl1e);
 
     if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
     {
@@ -1239,13 +1241,13 @@ static int mod_l1_entry(l1_pgentry_t *pl
         }
 
         /* Fast path for identical mapping, r/w and presence. */
-        if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
-            return update_l1e(pl1e, ol1e, nl1e);
+        if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
+            return update_l1e(pl1e, ol1e, nl1e, gl1mfn, current);
 
         if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
             return 0;
         
-        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
         {
             put_page_from_l1e(nl1e, d);
             return 0;
@@ -1253,7 +1255,7 @@ static int mod_l1_entry(l1_pgentry_t *pl
     }
     else
     {
-        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
             return 0;
     }
 
@@ -1262,9 +1264,9 @@ static int mod_l1_entry(l1_pgentry_t *pl
 }
 
 #ifndef PTE_UPDATE_WITH_CMPXCHG
-#define UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
+#define _UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
 #else
-#define UPDATE_ENTRY(_t,_p,_o,_n) ({                            \
+#define _UPDATE_ENTRY(_t,_p,_o,_n) ({                            \
     for ( ; ; )                                                 \
     {                                                           \
         intpte_t __o = cmpxchg((intpte_t *)(_p),                \
@@ -1279,6 +1281,18 @@ static int mod_l1_entry(l1_pgentry_t *pl
     }                                                           \
     1; })
 #endif
+#define UPDATE_ENTRY(_t,_p,_o,_n,_m)  ({                            \
+    int rv;                                                         \
+    if ( unlikely(shadow2_mode_enabled(current->domain)) )          \
+        shadow2_lock(current->domain);                              \
+    rv = _UPDATE_ENTRY(_t, _p, _o, _n);                             \
+    if ( unlikely(shadow2_mode_enabled(current->domain)) )          \
+    {                                                               \
+        shadow2_validate_guest_entry(current, _mfn(_m), (_p));      \
+        shadow2_unlock(current->domain);                            \
+    }                                                               \
+    rv;                                                             \
+})
 
 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
 static int mod_l2_entry(l2_pgentry_t *pl2e, 
@@ -1309,19 +1323,19 @@ static int mod_l2_entry(l2_pgentry_t *pl
 
         /* Fast path for identical mapping and presence. */
         if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
-            return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
+            return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn);
 
         if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
              unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
             return 0;
 
-        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
+        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
         {
             put_page_from_l2e(nl2e, pfn);
             return 0;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
+    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
     {
         return 0;
     }
@@ -1329,7 +1343,6 @@ static int mod_l2_entry(l2_pgentry_t *pl
     put_page_from_l2e(ol2e, pfn);
     return 1;
 }
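
update_l1e() and the UPDATE_ENTRY() macro now bracket every guest page-table write the same way: take the shadow2 lock, perform the raw write (or cmpxchg), let the shadow code revalidate the entry that just changed, then unlock. A minimal sketch of that bracket, with placeholder names:

    /* Sketch only: locking discipline around a guest PTE update when the
     * domain runs in a shadow2 mode. */
    struct domain_sk2;
    struct vcpu_pt_sk { struct domain_sk2 *domain; };

    extern int  sk_shadow2_enabled(struct domain_sk2 *d);
    extern void sk_shadow2_lock(struct domain_sk2 *d);
    extern void sk_shadow2_unlock(struct domain_sk2 *d);
    extern void sk_shadow2_validate_entry(struct vcpu_pt_sk *v,
                                          unsigned long table_mfn, void *entry);

    static int sk_update_pte(struct vcpu_pt_sk *v, unsigned long table_mfn,
                             unsigned long *entry, unsigned long new_val)
    {
        if (sk_shadow2_enabled(v->domain))
            sk_shadow2_lock(v->domain);

        *entry = new_val;            /* the real code may use cmpxchg here */

        if (sk_shadow2_enabled(v->domain)) {
            /* The shadow is brought up to date while the lock is still
             * held, so no stale shadow entry can be used meanwhile. */
            sk_shadow2_validate_entry(v, table_mfn, entry);
            sk_shadow2_unlock(v->domain);
        }
        return 1;
    }
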
-
 
 #if CONFIG_PAGING_LEVELS >= 3
 
@@ -1356,7 +1369,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
      */
     if ( pgentry_ptr_to_slot(pl3e) >= 3 )
         return 0;
-#endif
+#endif 
 
     if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
         return 0;
@@ -1372,7 +1385,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
 
         /* Fast path for identical mapping and presence. */
         if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
-            return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
+            return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn);
 
 #if CONFIG_PAGING_LEVELS >= 4
         if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
@@ -1383,15 +1396,15 @@ static int mod_l3_entry(l3_pgentry_t *pl
             << L3_PAGETABLE_SHIFT;
         if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
             return 0;
-#endif
-
-        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
+#endif 
+
+        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
         {
             put_page_from_l3e(nl3e, pfn);
             return 0;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
+    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
     {
         return 0;
     }
@@ -1438,19 +1451,19 @@ static int mod_l4_entry(l4_pgentry_t *pl
 
         /* Fast path for identical mapping and presence. */
         if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
-            return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
+            return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn);
 
         if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
              unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
             return 0;
 
-        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
+        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
         {
             put_page_from_l4e(nl4e, pfn);
             return 0;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
+    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
     {
         return 0;
     }
@@ -1506,18 +1519,21 @@ void free_page_type(struct page_info *pa
          */
         this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
 
-        if ( unlikely(shadow_mode_enabled(owner)) )
+        if ( unlikely(shadow2_mode_enabled(owner)
+                 && !shadow2_lock_is_acquired(owner)) )
         {
             /* Raw page tables are rewritten during save/restore. */
-            if ( !shadow_mode_translate(owner) )
+            if ( !shadow2_mode_translate(owner) )
                 mark_dirty(owner, page_to_mfn(page));
 
-            if ( shadow_mode_refcounts(owner) )
+            if ( shadow2_mode_refcounts(owner) )
                 return;
 
             gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
             ASSERT(VALID_M2P(gmfn));
-            remove_shadow(owner, gmfn, type & PGT_type_mask);
+            shadow2_lock(owner);
+            shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
+            shadow2_unlock(owner);
         }
     }
 
@@ -1573,9 +1589,6 @@ void put_page_type(struct page_info *pag
 
         if ( unlikely((nx & PGT_count_mask) == 0) )
         {
-            /* Record TLB information for flush later. Races are harmless. */
-            page->tlbflush_timestamp = tlbflush_current_time();
-            
             if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
                  likely(nx & PGT_validated) )
             {
@@ -1593,6 +1606,9 @@ void put_page_type(struct page_info *pag
                 x  &= ~PGT_validated;
                 nx &= ~PGT_validated;
             }
+
+            /* Record TLB information for flush later. */
+            page->tlbflush_timestamp = tlbflush_current_time();
         }
         else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) == 
                            (PGT_pinned|PGT_l1_page_table|1)) )
@@ -1682,7 +1698,7 @@ int get_page_type(struct page_info *page
 #endif
                     /* Fixme: add code to propagate va_unknown to subtables. */
                     if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
-                         !shadow_mode_refcounts(page_get_owner(page)) )
+                         !shadow2_mode_refcounts(page_get_owner(page)) )
                         return 0;
                     /* This table is possibly mapped at multiple locations. */
                     nx &= ~PGT_va_mask;
@@ -1729,7 +1745,10 @@ int new_guest_cr3(unsigned long mfn)
     int okay;
     unsigned long old_base_mfn;
 
-    if ( shadow_mode_refcounts(d) )
+    if ( hvm_guest(v) && !hvm_paging_enabled(v) )
+        domain_crash_synchronous();
+
+    if ( shadow2_mode_refcounts(d) )
     {
         okay = get_page_from_pagenr(mfn, d);
         if ( unlikely(!okay) )
@@ -1747,7 +1766,7 @@ int new_guest_cr3(unsigned long mfn)
             MEM_LOG("New baseptr %lx: slow path via idle pagetables", mfn);
             old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
             v->arch.guest_table = pagetable_null();
-            update_pagetables(v);
+            update_cr3(v);
             write_cr3(__pa(idle_pg_table));
             if ( old_base_mfn != 0 )
                 put_page_and_type(mfn_to_page(old_base_mfn));
@@ -1769,30 +1788,20 @@ int new_guest_cr3(unsigned long mfn)
     invalidate_shadow_ldt(v);
 
     old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+
     v->arch.guest_table = pagetable_from_pfn(mfn);
-    update_pagetables(v); /* update shadow_table and monitor_table */
+    update_cr3(v); /* update shadow_table and cr3 fields of vcpu struct */
 
     write_ptbase(v);
 
     if ( likely(old_base_mfn != 0) )
     {
-        if ( shadow_mode_refcounts(d) )
+        if ( shadow2_mode_refcounts(d) )
             put_page(mfn_to_page(old_base_mfn));
         else
             put_page_and_type(mfn_to_page(old_base_mfn));
     }
 
-    /* CR3 also holds a ref to its shadow... */
-    if ( shadow_mode_enabled(d) )
-    {
-        if ( v->arch.monitor_shadow_ref )
-            put_shadow_ref(v->arch.monitor_shadow_ref);
-        v->arch.monitor_shadow_ref =
-            pagetable_get_pfn(v->arch.monitor_table);
-        ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
-        get_shadow_ref(v->arch.monitor_shadow_ref);
-    }
-
     return 1;
 }
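
new_guest_cr3() now installs the new base table, recomputes the vcpu's cr3 with update_cr3(), loads it with write_ptbase(), and only then drops the reference on the old base table. The ordering, sketched with invented helpers:

    /* Sketch only: why the old reference is dropped last. */
    struct vcpu_cr3ref_sk {
        unsigned long guest_table_mfn;
        unsigned long cr3;
    };

    extern void sk_update_cr3(struct vcpu_cr3ref_sk *v); /* recompute v->cr3   */
    extern void sk_write_cr3(unsigned long cr3);         /* load into hardware */
    extern void sk_drop_table_ref(unsigned long mfn);    /* release old table  */

    static void sk_new_guest_cr3(struct vcpu_cr3ref_sk *v, unsigned long new_mfn)
    {
        unsigned long old_mfn = v->guest_table_mfn;

        v->guest_table_mfn = new_mfn;   /* install the new base table           */
        sk_update_cr3(v);               /* derive cr3 (and shadows) for it      */
        sk_write_cr3(v->cr3);           /* switch the hardware over             */

        if (old_mfn != 0)
            sk_drop_table_ref(old_mfn); /* only now is the old table truly idle */
    }
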
 
@@ -1807,8 +1816,6 @@ static void process_deferred_ops(void)
 
     if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
     {
-        if ( shadow_mode_enabled(d) )
-            shadow_sync_all(d);
         if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
             flush_tlb_mask(d->domain_dirty_cpumask);
         else
@@ -1974,7 +1981,7 @@ int do_mmuext_op(
             type = PGT_root_page_table;
 
         pin_page:
-            if ( shadow_mode_refcounts(FOREIGNDOM) )
+            if ( shadow2_mode_refcounts(FOREIGNDOM) )
                 break;
 
             okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
@@ -1996,7 +2003,7 @@ int do_mmuext_op(
             break;
 
         case MMUEXT_UNPIN_TABLE:
-            if ( shadow_mode_refcounts(d) )
+            if ( shadow2_mode_refcounts(d) )
                 break;
 
             if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
@@ -2009,6 +2016,12 @@ int do_mmuext_op(
             {
                 put_page_and_type(page);
                 put_page(page);
+                if ( shadow2_mode_enabled(d) )
+                {
+                    shadow2_lock(d);
+                    shadow2_remove_all_shadows(v, _mfn(mfn));
+                    shadow2_unlock(d);
+                }
             }
             else
             {
@@ -2050,9 +2063,9 @@ int do_mmuext_op(
             break;
     
         case MMUEXT_INVLPG_LOCAL:
-            if ( shadow_mode_enabled(d) )
-                shadow_invlpg(v, op.arg1.linear_addr);
-            local_flush_tlb_one(op.arg1.linear_addr);
+            if ( !shadow2_mode_enabled(d) 
+                 || shadow2_invlpg(v, op.arg1.linear_addr) != 0 )
+                local_flush_tlb_one(op.arg1.linear_addr);
             break;
 
         case MMUEXT_TLB_FLUSH_MULTI:
@@ -2098,7 +2111,7 @@ int do_mmuext_op(
             unsigned long ptr  = op.arg1.linear_addr;
             unsigned long ents = op.arg2.nr_ents;
 
-            if ( shadow_mode_external(d) )
+            if ( shadow2_mode_external(d) )
             {
                 MEM_LOG("ignoring SET_LDT hypercall from external "
                         "domain %u", d->domain_id);
@@ -2171,9 +2184,6 @@ int do_mmu_update(
 
     LOCK_BIGLOCK(d);
 
-    if ( unlikely(shadow_mode_enabled(d)) )
-        check_pagetable(v, "pre-mmu"); /* debug */
-
     if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
     {
         count &= ~MMU_UPDATE_PREEMPTED;
@@ -2248,7 +2258,12 @@ int do_mmu_update(
             case PGT_l3_page_table:
             case PGT_l4_page_table:
             {
-                ASSERT(!shadow_mode_refcounts(d));
+                if ( shadow2_mode_refcounts(d) )
+                {
+                    DPRINTK("mmu update on shadow-refcounted domain!");
+                    break;
+                }
+
                 if ( unlikely(!get_page_type(
                     page, type_info & (PGT_type_mask|PGT_va_mask))) )
                     goto not_a_pt;
@@ -2258,10 +2273,7 @@ int do_mmu_update(
                 case PGT_l1_page_table:
                 {
                     l1_pgentry_t l1e = l1e_from_intpte(req.val);
-                    okay = mod_l1_entry(va, l1e);
-                    if ( okay && unlikely(shadow_mode_enabled(d)) )
-                        shadow_l1_normal_pt_update(
-                            d, req.ptr, l1e, &sh_mapcache);
+                    okay = mod_l1_entry(va, l1e, mfn);
                 }
                 break;
                 case PGT_l2_page_table:
@@ -2269,9 +2281,6 @@ int do_mmu_update(
                     l2_pgentry_t l2e = l2e_from_intpte(req.val);
                     okay = mod_l2_entry(
                         (l2_pgentry_t *)va, l2e, mfn, type_info);
-                    if ( okay && unlikely(shadow_mode_enabled(d)) )
-                        shadow_l2_normal_pt_update(
-                            d, req.ptr, l2e, &sh_mapcache);
                 }
                 break;
 #if CONFIG_PAGING_LEVELS >= 3
@@ -2279,9 +2288,6 @@ int do_mmu_update(
                 {
                     l3_pgentry_t l3e = l3e_from_intpte(req.val);
                     okay = mod_l3_entry(va, l3e, mfn, type_info);
-                    if ( okay && unlikely(shadow_mode_enabled(d)) )
-                        shadow_l3_normal_pt_update(
-                            d, req.ptr, l3e, &sh_mapcache);
                 }
                 break;
 #endif
@@ -2290,9 +2296,6 @@ int do_mmu_update(
                 {
                     l4_pgentry_t l4e = l4e_from_intpte(req.val);
                     okay = mod_l4_entry(va, l4e, mfn, type_info);
-                    if ( okay && unlikely(shadow_mode_enabled(d)) )
-                        shadow_l4_normal_pt_update(
-                            d, req.ptr, l4e, &sh_mapcache);
                 }
                 break;
 #endif
@@ -2308,19 +2311,17 @@ int do_mmu_update(
                 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
                     break;
 
-                if ( shadow_mode_enabled(d) )
-                {
-                    shadow_lock(d);
-                    __mark_dirty(d, mfn);
-                    if ( page_is_page_table(page) && !page_out_of_sync(page) )
-                        shadow_mark_mfn_out_of_sync(v, gmfn, mfn);
-                }
+                if ( unlikely(shadow2_mode_enabled(d)) )
+                    shadow2_lock(d);
 
                 *(intpte_t *)va = req.val;
                 okay = 1;
 
-                if ( shadow_mode_enabled(d) )
-                    shadow_unlock(d);
+                if ( unlikely(shadow2_mode_enabled(d)) )
+                {
+                    shadow2_validate_guest_entry(v, _mfn(mfn), va);
+                    shadow2_unlock(d);
+                }
 
                 put_page_type(page);
             }
@@ -2333,12 +2334,6 @@ int do_mmu_update(
             break;
 
         case MMU_MACHPHYS_UPDATE:
-
-            if ( shadow_mode_translate(FOREIGNDOM) )
-            {
-                MEM_LOG("can't mutate m2p table of translate mode guest");
-                break;
-            }
 
             mfn = req.ptr >> PAGE_SHIFT;
             gpfn = req.val;
@@ -2349,9 +2344,13 @@ int do_mmu_update(
                 break;
             }
 
-            set_gpfn_from_mfn(mfn, gpfn);
+            if ( shadow2_mode_translate(FOREIGNDOM) )
+                shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
+            else 
+                set_gpfn_from_mfn(mfn, gpfn);
             okay = 1;
 
+            // Mark the new gfn dirty...
             mark_dirty(FOREIGNDOM, mfn);
 
             put_page(mfn_to_page(mfn));
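
MMU_MACHPHYS_UPDATE no longer rejects translate-mode guests: for those domains the update goes through the shadow2 physmap call, while others keep the plain m2p update. A small sketch of the dispatch, with invented helpers:

    /* Sketch only: machine <-> guest-physical bookkeeping per domain type. */
    struct domain_sk3;
    extern int  sk_mode_translate(struct domain_sk3 *d);
    extern void sk_guest_physmap_add_page(struct domain_sk3 *d,
                                          unsigned long gpfn, unsigned long mfn);
    extern void sk_set_gpfn_from_mfn(unsigned long mfn, unsigned long gpfn);

    static void sk_machphys_update(struct domain_sk3 *d,
                                   unsigned long mfn, unsigned long gpfn)
    {
        if (sk_mode_translate(d))
            /* Shadow2 owns the p2m for translated guests, so route the
             * update through its physmap operation. */
            sk_guest_physmap_add_page(d, gpfn, mfn);
        else
            sk_set_gpfn_from_mfn(mfn, gpfn); /* plain m2p update */
    }
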
@@ -2381,9 +2380,6 @@ int do_mmu_update(
     done += i;
     if ( unlikely(!guest_handle_is_null(pdone)) )
         copy_to_guest(pdone, &done, 1);
-
-    if ( unlikely(shadow_mode_enabled(d)) )
-        check_pagetable(v, "post-mmu"); /* debug */
 
     UNLOCK_BIGLOCK(d);
     return rc;
@@ -2402,7 +2398,6 @@ static int create_grant_pte_mapping(
     struct domain *d = v->domain;
 
     ASSERT(spin_is_locked(&d->big_lock));
-    ASSERT(!shadow_mode_refcounts(d));
 
     gmfn = pte_addr >> PAGE_SHIFT;
     mfn = gmfn_to_mfn(d, gmfn);
@@ -2418,7 +2413,7 @@ static int create_grant_pte_mapping(
     page = mfn_to_page(mfn);
 
     type_info = page->u.inuse.type_info;
-    if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
+    if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||         
          !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
     {
         MEM_LOG("Grant map attempted to update a non-L1 page");
@@ -2427,28 +2422,22 @@ static int create_grant_pte_mapping(
     }
 
     ol1e = *(l1_pgentry_t *)va;
-    if ( !update_l1e(va, ol1e, _nl1e) )
+    if ( !update_l1e(va, ol1e, _nl1e, mfn, v) )
     {
         put_page_type(page);
         rc = GNTST_general_error;
         goto failed;
     } 
 
-    put_page_from_l1e(ol1e, d);
-
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        struct domain_mmap_cache sh_mapcache;
-        domain_mmap_cache_init(&sh_mapcache);
-        shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
-        domain_mmap_cache_destroy(&sh_mapcache);
-    }
+    if ( !shadow2_mode_refcounts(d) )
+        put_page_from_l1e(ol1e, d);
 
     put_page_type(page);
  
  failed:
     unmap_domain_page(va);
     put_page(page);
+
     return rc;
 }
 
@@ -2462,8 +2451,6 @@ static int destroy_grant_pte_mapping(
     u32 type_info;
     l1_pgentry_t ol1e;
 
-    ASSERT(!shadow_mode_refcounts(d));
-
     gmfn = addr >> PAGE_SHIFT;
     mfn = gmfn_to_mfn(d, gmfn);
 
@@ -2504,7 +2491,9 @@ static int destroy_grant_pte_mapping(
     }
 
     /* Delete pagetable entry. */
-    if ( unlikely(!update_l1e((l1_pgentry_t *)va, ol1e, l1e_empty())) )
+    if ( unlikely(!update_l1e(
+                      (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn, 
+                      d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
     {
         MEM_LOG("Cannot delete PTE entry at %p", va);
         put_page_type(page);
@@ -2512,14 +2501,6 @@ static int destroy_grant_pte_mapping(
         goto failed;
     }
 
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        struct domain_mmap_cache sh_mapcache;
-        domain_mmap_cache_init(&sh_mapcache);
-        shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
-        domain_mmap_cache_destroy(&sh_mapcache);
-    }
-
     put_page_type(page);
 
  failed:
@@ -2536,31 +2517,22 @@ static int create_grant_va_mapping(
     struct domain *d = v->domain;
     
     ASSERT(spin_is_locked(&d->big_lock));
-    ASSERT(!shadow_mode_refcounts(d));
-
-    /*
-     * This is actually overkill - we don't need to sync the L1 itself,
-     * just everything involved in getting to this L1 (i.e. we need
-     * linear_pg_table[l1_linear_offset(va)] to be in sync)...
-     */
-    __shadow_sync_va(v, va);
 
     pl1e = &linear_pg_table[l1_linear_offset(va)];
 
     if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
-         !update_l1e(pl1e, ol1e, _nl1e) )
+         !update_l1e(pl1e, ol1e, _nl1e, 
+                    l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) )
         return GNTST_general_error;
 
-    put_page_from_l1e(ol1e, d);
-
-    if ( unlikely(shadow_mode_enabled(d)) )
-        shadow_do_update_va_mapping(va, _nl1e, v);
+    if ( !shadow2_mode_refcounts(d) )
+        put_page_from_l1e(ol1e, d);
 
     return GNTST_okay;
 }
 
 static int destroy_grant_va_mapping(
-    unsigned long addr, unsigned long frame)
+    unsigned long addr, unsigned long frame, struct domain *d)
 {
     l1_pgentry_t *pl1e, ol1e;
     
@@ -2584,12 +2556,14 @@ static int destroy_grant_va_mapping(
     }
 
     /* Delete pagetable entry. */
-    if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty())) )
+    if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(), 
+                      l2e_get_pfn(__linear_l2_table[l2_linear_offset(addr)]),
+                      d->vcpu[0] /* Change for per-vcpu shadows */)) )
     {
         MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
         return GNTST_general_error;
     }
-    
+
     return 0;
 }
 
@@ -2597,7 +2571,7 @@ int create_grant_host_mapping(
     unsigned long addr, unsigned long frame, unsigned int flags)
 {
     l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
-        
+
     if ( (flags & GNTMAP_application_map) )
         l1e_add_flags(pte,_PAGE_USER);
     if ( !(flags & GNTMAP_readonly) )
@@ -2613,7 +2587,7 @@ int destroy_grant_host_mapping(
 {
     if ( flags & GNTMAP_contains_pte )
         return destroy_grant_pte_mapping(addr, frame, current->domain);
-    return destroy_grant_va_mapping(addr, frame);
+    return destroy_grant_va_mapping(addr, frame, current->domain);
 }
 
 int steal_page(
@@ -2675,46 +2649,44 @@ int do_update_va_mapping(unsigned long v
 
     perfc_incrc(calls_to_update_va);
 
-    if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
+    if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) )
         return -EINVAL;
 
+    if ( unlikely(shadow2_mode_refcounts(d)) )
+    {
+        DPRINTK("Grant op on a shadow-refcounted domain\n");
+        return -EINVAL; 
+    }
+
     LOCK_BIGLOCK(d);
 
-    if ( unlikely(shadow_mode_enabled(d)) )
-        check_pagetable(v, "pre-va"); /* debug */
-
-    if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
-                                val)) )
-        rc = -EINVAL;
-
-    if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
+    if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) )
     {
         if ( unlikely(this_cpu(percpu_mm_info).foreign &&
-                      (shadow_mode_translate(d) ||
-                       shadow_mode_translate(
+                      (shadow2_mode_translate(d) ||
+                       shadow2_mode_translate(
                            this_cpu(percpu_mm_info).foreign))) )
         {
             /*
              * The foreign domain's pfn's are in a different namespace. There's
-             * not enough information in just a gpte to figure out how to
+             * not enough information in just a gpte to figure out how to   
              * (re-)shadow this entry.
              */
             domain_crash(d);
         }
+    }
+
+    if ( unlikely(!mod_l1_entry(
+                      &linear_pg_table[l1_linear_offset(va)], val,
+                      l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]))) )
+        rc = -EINVAL;
     
-        rc = shadow_do_update_va_mapping(va, val, v);
-
-        check_pagetable(v, "post-va"); /* debug */
-    }
-
     switch ( flags & UVMF_FLUSHTYPE_MASK )
     {
     case UVMF_TLB_FLUSH:
         switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
         {
         case UVMF_LOCAL:
-            if ( unlikely(shadow_mode_enabled(d)) )
-                shadow_sync_all(d);
             local_flush_tlb();
             break;
         case UVMF_ALL:
@@ -2733,9 +2705,9 @@ int do_update_va_mapping(unsigned long v
         switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
         {
         case UVMF_LOCAL:
-            if ( unlikely(shadow_mode_enabled(d)) )
-                shadow_invlpg(current, va);
-            local_flush_tlb_one(va);
+            if ( !shadow2_mode_enabled(d) 
+                 || (shadow2_invlpg(current, va) != 0) ) 
+                local_flush_tlb_one(va);
             break;
         case UVMF_ALL:
             flush_tlb_one_mask(d->domain_dirty_cpumask, va);
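
Both invlpg sites follow the same pattern: call shadow2_invlpg() when shadow2 is active, and only issue the hardware flush when shadow2 is off or the call reports (by returning nonzero) that a flush is still needed. In sketch form, with placeholder names:

    /* Sketch only: the invlpg pattern used at both call sites. */
    struct domain_sk4;
    struct vcpu_invlpg_sk { struct domain_sk4 *domain; };

    extern int  sk_shadow2_active(struct domain_sk4 *d);
    extern int  sk_shadow2_invlpg(struct vcpu_invlpg_sk *v, unsigned long va);
    extern void sk_flush_tlb_one(unsigned long va);

    static void sk_invlpg_local(struct vcpu_invlpg_sk *v, unsigned long va)
    {
        /* Nonzero from the shadow code means "a hardware flush is still
         * required"; zero means it already took care of the mapping. */
        if (!sk_shadow2_active(v->domain) || sk_shadow2_invlpg(v, va) != 0)
            sk_flush_tlb_one(va);
    }
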
@@ -2807,8 +2779,6 @@ long set_gdt(struct vcpu *v,
 
     if ( entries > FIRST_RESERVED_GDT_ENTRY )
         return -EINVAL;
-
-    shadow_sync_all(d);
 
     /* Check the pages in the new GDT. */
     for ( i = 0; i < nr_pages; i++ ) {
@@ -2912,24 +2882,13 @@ long do_update_descriptor(u64 pa, u64 de
         break;
     }
 
-    if ( shadow_mode_enabled(dom) )
-    {
-        shadow_lock(dom);
-
-        __mark_dirty(dom, mfn);
-
-        if ( page_is_page_table(page) && !page_out_of_sync(page) )
-            shadow_mark_mfn_out_of_sync(current, gmfn, mfn);
-    }
+    mark_dirty(dom, mfn);
 
     /* All is good so make the update. */
     gdt_pent = map_domain_page(mfn);
     memcpy(&gdt_pent[offset], &d, 8);
     unmap_domain_page(gdt_pent);
 
-    if ( shadow_mode_enabled(dom) )
-        shadow_unlock(dom);
-
     put_page_type(page);
 
     ret = 0; /* success */
@@ -2981,8 +2940,8 @@ long arch_memory_op(int op, XEN_GUEST_HA
         default:
             break;
         }
-        
-        if ( !shadow_mode_translate(d) || (mfn == 0) )
+
+        if ( !shadow2_mode_translate(d) || (mfn == 0) )
         {
             put_domain(d);
             return -EINVAL;
@@ -3011,7 +2970,7 @@ long arch_memory_op(int op, XEN_GUEST_HA
         guest_physmap_add_page(d, xatp.gpfn, mfn);
 
         UNLOCK_BIGLOCK(d);
-
+        
         put_domain(d);
 
         break;
@@ -3136,7 +3095,8 @@ static int ptwr_emulated_update(
     unsigned long pfn;
     struct page_info *page;
     l1_pgentry_t pte, ol1e, nl1e, *pl1e;
-    struct domain *d = current->domain;
+    struct vcpu *v = current;
+    struct domain *d = v->domain;
 
     /* Aligned access only, thank you. */
     if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
@@ -3196,25 +3156,36 @@ static int ptwr_emulated_update(
         return X86EMUL_UNHANDLEABLE;
     }
 
+
     /* Checked successfully: do the update (write or cmpxchg). */
     pl1e = map_domain_page(page_to_mfn(page));
     pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
     if ( do_cmpxchg )
     {
+        if ( shadow2_mode_enabled(d) )
+            shadow2_lock(d);
         ol1e = l1e_from_intpte(old);
         if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
         {
+            if ( shadow2_mode_enabled(d) )
+                shadow2_unlock(d);
             unmap_domain_page(pl1e);
             put_page_from_l1e(nl1e, d);
             return X86EMUL_CMPXCHG_FAILED;
         }
+        if ( unlikely(shadow2_mode_enabled(v->domain)) )
+        {
+            shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
+            shadow2_unlock(v->domain);    
+        }
     }
     else
     {
         ol1e = *pl1e;
-        if ( !update_l1e(pl1e, ol1e, nl1e) )
+        if ( !update_l1e(pl1e, ol1e, nl1e, page_to_mfn(page), v) )
             BUG();
     }
+
     unmap_domain_page(pl1e);
 
     /* Finally, drop the old PTE. */
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/setup.c      Wed Aug 16 17:02:35 2006 +0100
@@ -532,8 +532,6 @@ void __init __start_xen(multiboot_info_t
     if ( opt_watchdog ) 
         watchdog_enable();
 
-    shadow_mode_init();
-
     /* initialize access control security module */
     acm_init(&initrdidx, mbi, initial_images_start);
 
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/smpboot.c    Wed Aug 16 17:02:35 2006 +0100
@@ -896,7 +896,7 @@ static int __devinit do_boot_cpu(int api
        v = alloc_idle_vcpu(cpu);
        BUG_ON(v == NULL);
 
-       v->arch.monitor_table = pagetable_from_paddr(__pa(idle_pg_table));
+       v->arch.cr3 = __pa(idle_pg_table);
 
        /* start_eip had better be page-aligned! */
        start_eip = setup_trampoline();
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/traps.c      Wed Aug 16 17:02:35 2006 +0100
@@ -277,6 +277,21 @@ void show_stack(struct cpu_user_regs *re
     show_trace(regs);
 }
 
+void show_xen_trace()
+{
+    struct cpu_user_regs regs;
+#ifdef __x86_64
+    __asm__("movq %%rsp,%0" : "=m" (regs.rsp));
+    __asm__("movq %%rbp,%0" : "=m" (regs.rbp));
+    __asm__("leaq 0(%%rip),%0" : "=a" (regs.rip));
+#else
+    __asm__("movl %%esp,%0" : "=m" (regs.esp));
+    __asm__("movl %%ebp,%0" : "=m" (regs.ebp));
+    __asm__("call 1f; 1: popl %0" : "=a" (regs.eip));
+#endif
+    show_trace(&regs);
+}
+
 void show_stack_overflow(unsigned long esp)
 {
 #ifdef MEMORY_GUARD
@@ -861,8 +876,8 @@ static int fixup_page_fault(unsigned lon
 
     if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
     {
-        if ( shadow_mode_external(d) && guest_mode(regs) )
-            return shadow_fault(addr, regs);
+        if ( shadow2_mode_external(d) && guest_mode(regs) )
+            return shadow2_fault(addr, regs);
         if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
             return handle_gdt_ldt_mapping_fault(
                 addr - GDT_LDT_VIRT_START, regs);
@@ -873,14 +888,14 @@ static int fixup_page_fault(unsigned lon
         return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0);
     }
 
-    if ( unlikely(shadow_mode_enabled(d)) )
-        return shadow_fault(addr, regs);
-
     if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) &&
          guest_kernel_mode(v, regs) &&
          ((regs->error_code & (PGERR_write_access|PGERR_page_present)) ==
           (PGERR_write_access|PGERR_page_present)) )
         return ptwr_do_page_fault(d, addr, regs) ? EXCRET_fault_fixed : 0;
+
+    if ( shadow2_mode_enabled(d) )
+        return shadow2_fault(addr, regs);
 
     return 0;
 }
@@ -905,6 +920,13 @@ asmlinkage int do_page_fault(struct cpu_
     DEBUGGER_trap_entry(TRAP_page_fault, regs);
 
     perfc_incrc(page_faults);
+
+    if ( shadow2_mode_enabled(current->domain) )
+        debugtrace_printk("%s %s %d dom=%d eip=%p cr2=%p code=%d cs=%x\n",
+                          __func__, __FILE__, __LINE__,
+                          current->domain->domain_id,
+                          (void *)regs->eip, (void *)addr, regs->error_code,
+                          regs->cs);
 
     if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
         return rc;
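(For illustration:) show_xen_trace() above seeds show_trace() with the current stack, frame and instruction pointers grabbed by hand-written asm. For comparison, a standalone userspace sketch of the same "trace from right here" idea using glibc's backtrace(); Xen itself of course cannot use execinfo:

    #include <execinfo.h>
    #include <unistd.h>

    static void show_trace_from_here(void)
    {
        void *frames[32];
        int n = backtrace(frames, 32);                   /* capture current call stack */
        backtrace_symbols_fd(frames, n, STDOUT_FILENO);  /* print it (link -rdynamic for names) */
    }

    static void inner(void) { show_trace_from_here(); }
    static void outer(void) { inner(); }

    int main(void)
    {
        outer();
        return 0;
    }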
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/x86_32/domain_page.c
--- a/xen/arch/x86/x86_32/domain_page.c Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/x86_32/domain_page.c Wed Aug 16 17:02:35 2006 +0100
@@ -15,6 +15,7 @@
 #include <asm/current.h>
 #include <asm/flushtlb.h>
 #include <asm/hardirq.h>
+#include <asm/hvm/support.h>
 
 static inline struct vcpu *mapcache_current_vcpu(void)
 {
@@ -58,10 +59,10 @@ void *map_domain_page(unsigned long pfn)
     cache = &v->domain->arch.mapcache;
 
     hashent = &cache->vcpu_maphash[vcpu].hash[MAPHASH_HASHFN(pfn)];
-    if ( hashent->pfn == pfn )
-    {
-        idx = hashent->idx;
+    if ( hashent->pfn == pfn && (idx = hashent->idx) != MAPHASHENT_NOTINUSE )
+    {
         hashent->refcnt++;
+        ASSERT(idx < MAPCACHE_ENTRIES);
         ASSERT(hashent->refcnt != 0);
         ASSERT(l1e_get_pfn(cache->l1tab[idx]) == pfn);
         goto out;
@@ -178,6 +179,30 @@ void mapcache_init(struct domain *d)
                 MAPHASHENT_NOTINUSE;
 }
 
+paddr_t mapped_domain_page_to_maddr(void *va) 
+/* Convert a pointer in a mapped domain page to a machine address. 
+ * Takes any pointer that's valid for use in unmap_domain_page() */
+{
+    unsigned int idx;
+    struct vcpu *v;
+    struct mapcache *cache;
+    unsigned long pfn;
+
+    ASSERT(!in_irq());
+
+    ASSERT((void *)MAPCACHE_VIRT_START <= va);
+    ASSERT(va < (void *)MAPCACHE_VIRT_END);
+
+    v = mapcache_current_vcpu();
+
+    cache = &v->domain->arch.mapcache;
+
+    idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
+    pfn = l1e_get_pfn(cache->l1tab[idx]);
+    return ((paddr_t) pfn << PAGE_SHIFT 
+            | ((unsigned long) va & ~PAGE_MASK));
+}
+
 #define GLOBALMAP_BITS (IOREMAP_MBYTES << (20 - PAGE_SHIFT))
 static unsigned long inuse[BITS_TO_LONGS(GLOBALMAP_BITS)];
 static unsigned long garbage[BITS_TO_LONGS(GLOBALMAP_BITS)];
@@ -233,6 +258,8 @@ void unmap_domain_page_global(void *va)
     l1_pgentry_t *pl1e;
     unsigned int idx;
 
+    ASSERT((__va >= IOREMAP_VIRT_START) && (__va <= (IOREMAP_VIRT_END - 1)));
+
     /* /First/, we zap the PTE. */
     pl2e = virt_to_xen_l2e(__va);
     pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(__va);
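(For illustration:) mapped_domain_page_to_maddr() recovers the mapcache slot index from the virtual address, looks up the frame that slot maps, and splices the in-page offset back on. A standalone sketch of that arithmetic, with made-up constants and a fake l1tab:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT          12
    #define PAGE_SIZE           (1UL << PAGE_SHIFT)
    #define PAGE_MASK           (~(PAGE_SIZE - 1))
    #define MAPCACHE_VIRT_START 0xffc00000UL     /* example value only */

    static unsigned long l1tab_pfn[1024];        /* pfn mapped by each mapcache slot */

    static uint64_t mapped_page_to_maddr(unsigned long va)
    {
        unsigned int idx = (va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
        unsigned long pfn = l1tab_pfn[idx];
        return ((uint64_t)pfn << PAGE_SHIFT) | (va & ~PAGE_MASK);
    }

    int main(void)
    {
        l1tab_pfn[3] = 0x12345;                  /* slot 3 maps machine frame 0x12345 */
        unsigned long va = MAPCACHE_VIRT_START + 3 * PAGE_SIZE + 0x2a8;
        printf("maddr = %#llx\n", (unsigned long long)mapped_page_to_maddr(va));
        return 0;
    }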
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/x86_32/mm.c
--- a/xen/arch/x86/x86_32/mm.c  Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/x86_32/mm.c  Wed Aug 16 17:02:35 2006 +0100
@@ -75,8 +75,7 @@ void __init paging_init(void)
     printk("PAE disabled.\n");
 #endif
 
-    idle_vcpu[0]->arch.monitor_table =
-        pagetable_from_paddr(__pa(idle_pg_table));
+    idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table);
 
     if ( cpu_has_pge )
     {
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/x86_64/mm.c
--- a/xen/arch/x86/x86_64/mm.c  Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/x86_64/mm.c  Wed Aug 16 17:02:35 2006 +0100
@@ -81,8 +81,7 @@ void __init paging_init(void)
     l2_pgentry_t *l2_ro_mpt;
     struct page_info *pg;
 
-    idle_vcpu[0]->arch.monitor_table =
-        pagetable_from_paddr(__pa(idle_pg_table));
+    idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table);
 
     /* Create user-accessible L2 directory to map the MPT for guests. */
     l3_ro_mpt = alloc_xenheap_page();
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c       Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/arch/x86/x86_64/traps.c       Wed Aug 16 17:02:35 2006 +0100
@@ -84,7 +84,8 @@ void show_page_walk(unsigned long addr)
     l4e = l4t[l4_table_offset(addr)];
     mfn = l4e_get_pfn(l4e);
     pfn = get_gpfn_from_mfn(mfn);
-    printk(" L4 = %"PRIpte" %016lx\n", l4e_get_intpte(l4e), pfn);
+    printk(" L4[0x%lx] = %"PRIpte" %016lx\n",
+           l4_table_offset(addr), l4e_get_intpte(l4e), pfn);
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return;
 
@@ -92,7 +93,8 @@ void show_page_walk(unsigned long addr)
     l3e = l3t[l3_table_offset(addr)];
     mfn = l3e_get_pfn(l3e);
     pfn = get_gpfn_from_mfn(mfn);
-    printk("  L3 = %"PRIpte" %016lx\n", l3e_get_intpte(l3e), pfn);
+    printk("  L3[0x%lx] = %"PRIpte" %016lx\n",
+           l3_table_offset(addr), l3e_get_intpte(l3e), pfn);
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
         return;
 
@@ -100,7 +102,8 @@ void show_page_walk(unsigned long addr)
     l2e = l2t[l2_table_offset(addr)];
     mfn = l2e_get_pfn(l2e);
     pfn = get_gpfn_from_mfn(mfn);
-    printk("   L2 = %"PRIpte" %016lx %s\n", l2e_get_intpte(l2e), pfn,
+    printk("   L2[0x%lx] = %"PRIpte" %016lx %s\n",
+           l2_table_offset(addr), l2e_get_intpte(l2e), pfn,
            (l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : "");
     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
          (l2e_get_flags(l2e) & _PAGE_PSE) )
@@ -110,7 +113,8 @@ void show_page_walk(unsigned long addr)
     l1e = l1t[l1_table_offset(addr)];
     mfn = l1e_get_pfn(l1e);
     pfn = get_gpfn_from_mfn(mfn);
-    printk("    L1 = %"PRIpte" %016lx\n", l1e_get_intpte(l1e), pfn);
+    printk("    L1[0x%lx] = %"PRIpte" %016lx\n",
+           l1_table_offset(addr), l1e_get_intpte(l1e), pfn);
 }
 
 asmlinkage void double_fault(void);
@@ -162,7 +166,7 @@ void toggle_guest_mode(struct vcpu *v)
 {
     v->arch.flags ^= TF_kernel_mode;
     __asm__ __volatile__ ( "swapgs" );
-    update_pagetables(v);
+    update_cr3(v);
     write_ptbase(v);
 }
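(For illustration:) the show_page_walk() changes above print the table index used at each level. Each index is just a 9-bit slice of the virtual address; a standalone sketch of that arithmetic for the usual 4-level x86-64 layout:

    #include <stdio.h>

    #define PAGETABLE_ORDER  9
    #define PAGE_SHIFT       12
    #define L1_SHIFT         PAGE_SHIFT                     /* 12 */
    #define L2_SHIFT         (L1_SHIFT + PAGETABLE_ORDER)   /* 21 */
    #define L3_SHIFT         (L2_SHIFT + PAGETABLE_ORDER)   /* 30 */
    #define L4_SHIFT         (L3_SHIFT + PAGETABLE_ORDER)   /* 39 */
    #define TABLE_ENTRIES    (1ULL << PAGETABLE_ORDER)      /* 512 entries per table */

    #define table_offset(addr, shift)  (((addr) >> (shift)) & (TABLE_ENTRIES - 1))

    int main(void)
    {
        unsigned long long addr = 0xffff830012345678ULL;    /* arbitrary example VA */

        printf("L4[0x%llx] L3[0x%llx] L2[0x%llx] L1[0x%llx]\n",
               table_offset(addr, L4_SHIFT), table_offset(addr, L3_SHIFT),
               table_offset(addr, L2_SHIFT), table_offset(addr, L1_SHIFT));
        return 0;
    }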
 
diff -r fda70200da01 -r 0f917d63e960 xen/common/acm_ops.c
--- a/xen/common/acm_ops.c      Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/common/acm_ops.c      Wed Aug 16 17:02:35 2006 +0100
@@ -26,7 +26,6 @@
 #include <xen/trace.h>
 #include <xen/console.h>
 #include <xen/guest_access.h>
-#include <asm/shadow.h>
 #include <public/sched_ctl.h>
 #include <acm/acm_hooks.h>
 
diff -r fda70200da01 -r 0f917d63e960 xen/common/grant_table.c
--- a/xen/common/grant_table.c  Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/common/grant_table.c  Wed Aug 16 17:02:35 2006 +0100
@@ -434,7 +434,7 @@ __gnttab_unmap_grant_ref(
 
     /* If just unmapped a writable mapping, mark as dirtied */
     if ( !(flags & GNTMAP_readonly) )
-         gnttab_log_dirty(rd, frame);
+         gnttab_mark_dirty(rd, frame);
 
     if ( ((act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) == 0) &&
          !(flags & GNTMAP_readonly) )
@@ -731,7 +731,7 @@ __release_grant_for_copy(
     const unsigned long r_frame = act->frame;
 
     if ( !readonly )
-        gnttab_log_dirty(rd, r_frame);
+        gnttab_mark_dirty(rd, r_frame);
 
     spin_lock(&rd->grant_table->lock);
     if ( readonly )
diff -r fda70200da01 -r 0f917d63e960 xen/common/keyhandler.c
--- a/xen/common/keyhandler.c   Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/common/keyhandler.c   Wed Aug 16 17:02:35 2006 +0100
@@ -241,9 +241,6 @@ static void read_clocks(unsigned char ke
 }
 
 extern void dump_runq(unsigned char key);
-#ifndef NDEBUG
-extern void audit_domains_key(unsigned char key);
-#endif
 
 #ifdef PERF_COUNTERS
 extern void perfc_printall(unsigned char key);
@@ -261,10 +258,16 @@ static void do_debug_key(unsigned char k
 #ifndef NDEBUG
 static void debugtrace_key(unsigned char key)
 {
-    debugtrace_send_to_console = !debugtrace_send_to_console;
-    debugtrace_dump();
-    printk("debugtrace_printk now writing to %s.\n",
-           debugtrace_send_to_console ? "console" : "buffer");
+    debugtrace_toggle();
+}
+
+static void shadow2_audit_key(unsigned char key)
+{
+    extern int shadow2_audit_enable;
+
+    shadow2_audit_enable = !shadow2_audit_enable;
+    printk("%s shadow2_audit_enable=%d\n",
+           __func__, shadow2_audit_enable);
 }
 #endif
 
@@ -288,7 +291,7 @@ void initialize_keytable(void)
 
 #ifndef NDEBUG
     register_keyhandler(
-        'o', audit_domains_key,  "audit domains >0 EXPERIMENTAL");
+        'O', shadow2_audit_key,  "toggle shadow2 audits");
     register_keyhandler(
         'T', debugtrace_key, "toggle debugtrace to console/buffer");
 #endif
diff -r fda70200da01 -r 0f917d63e960 xen/common/memory.c
--- a/xen/common/memory.c       Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/common/memory.c       Wed Aug 16 17:02:35 2006 +0100
@@ -126,6 +126,11 @@ populate_physmap(
             for ( j = 0; j < (1 << extent_order); j++ )
                 guest_physmap_add_page(d, gpfn + j, mfn + j);
         }
+        else if ( unlikely(shadow2_mode_translate(d)) )
+        {
+            for ( j = 0; j < (1 << extent_order); j++ )
+                shadow2_guest_physmap_add_page(d, gpfn + j, mfn + j);
+        }
         else
         {
             for ( j = 0; j < (1 << extent_order); j++ )
@@ -153,7 +158,7 @@ guest_remove_page(
     if ( unlikely(!mfn_valid(mfn)) )
     {
         DPRINTK("Domain %u page number %lx invalid\n",
-                d->domain_id, mfn);
+                d->domain_id, gmfn);
         return 0;
     }
             
@@ -179,7 +184,7 @@ guest_remove_page(
                 (unsigned long)page->count_info, page->u.inuse.type_info);
     }
 
-    guest_physmap_remove_page(d, gmfn, mfn);
+    shadow2_guest_physmap_remove_page(d, gmfn, mfn);
 
     put_page(page);
 
@@ -250,7 +255,7 @@ translate_gpfn_list(
     if ( (d = find_domain_by_id(op.domid)) == NULL )
         return -ESRCH;
 
-    if ( !shadow_mode_translate(d) )
+    if ( !(shadow_mode_translate(d) || shadow2_mode_translate(d)) )
     {
         put_domain(d);
         return -EINVAL;
diff -r fda70200da01 -r 0f917d63e960 xen/drivers/char/console.c
--- a/xen/drivers/char/console.c        Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/drivers/char/console.c        Wed Aug 16 17:02:35 2006 +0100
@@ -569,7 +569,7 @@ int console_getc(void)
 #ifndef NDEBUG
 
 /* Send output direct to console, or buffer it? */
-int debugtrace_send_to_console;
+static volatile int debugtrace_send_to_console;
 
 static char        *debugtrace_buf; /* Debug-trace buffer */
 static unsigned int debugtrace_prd; /* Producer index     */
@@ -578,16 +578,10 @@ static DEFINE_SPINLOCK(debugtrace_lock);
 static DEFINE_SPINLOCK(debugtrace_lock);
 integer_param("debugtrace", debugtrace_kilobytes);
 
-void debugtrace_dump(void)
-{
-    unsigned long flags;
-
+static void debugtrace_dump_worker(void)
+{
     if ( (debugtrace_bytes == 0) || !debugtrace_used )
         return;
-
-    watchdog_disable();
-
-    spin_lock_irqsave(&debugtrace_lock, flags);
 
     printk("debugtrace_dump() starting\n");
 
@@ -602,15 +596,47 @@ void debugtrace_dump(void)
     memset(debugtrace_buf, '\0', debugtrace_bytes);
 
     printk("debugtrace_dump() finished\n");
+}
+
+void debugtrace_toggle(void)
+{
+    unsigned long flags;
+
+    watchdog_disable();
+    spin_lock_irqsave(&debugtrace_lock, flags);
+
+    // dump the buffer *before* toggling, in case the act of dumping the
+    // buffer itself causes more printk's...
+    //
+    printk("debugtrace_printk now writing to %s.\n",
+           !debugtrace_send_to_console ? "console": "buffer");
+    if ( !debugtrace_send_to_console )
+        debugtrace_dump_worker();
+
+    debugtrace_send_to_console = !debugtrace_send_to_console;
 
     spin_unlock_irqrestore(&debugtrace_lock, flags);
-
     watchdog_enable();
+
+}
+
+void debugtrace_dump(void)
+{
+    unsigned long flags;
+
+    watchdog_disable();
+    spin_lock_irqsave(&debugtrace_lock, flags);
+
+    debugtrace_dump_worker();
+
+    spin_unlock_irqrestore(&debugtrace_lock, flags);
+    watchdog_enable();
 }
 
 void debugtrace_printk(const char *fmt, ...)
 {
     static char    buf[1024];
+    static u32 count;
 
     va_list       args;
     char         *p;
@@ -625,8 +651,10 @@ void debugtrace_printk(const char *fmt, 
 
     ASSERT(debugtrace_buf[debugtrace_bytes - 1] == 0);
 
+    sprintf(buf, "%u ", ++count);
+
     va_start(args, fmt);
-    (void)vsnprintf(buf, sizeof(buf), fmt, args);
+    (void)vsnprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), fmt, args);
     va_end(args);
 
     if ( debugtrace_send_to_console )
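(For illustration:) debugtrace_printk() now prefixes each entry with a sequence number and formats the message after it. A standalone sketch of that pattern; note the second write is sized to the space remaining after the prefix (the buffer name and size here are invented):

    #include <stdarg.h>
    #include <stdio.h>
    #include <string.h>

    static void trace_printk(const char *fmt, ...)
    {
        static char buf[1024];
        static unsigned int count;
        va_list args;
        size_t used;

        snprintf(buf, sizeof(buf), "%u ", ++count);   /* "N " sequence prefix */
        used = strlen(buf);

        va_start(args, fmt);
        vsnprintf(buf + used, sizeof(buf) - used, fmt, args);
        va_end(args);

        fputs(buf, stdout);                           /* stands in for the ring buffer */
    }

    int main(void)
    {
        trace_printk("page fault at %#lx\n", 0xdeadbeefUL);
        trace_printk("second entry\n");
        return 0;
    }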
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/bitops.h
--- a/xen/include/asm-x86/bitops.h      Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/bitops.h      Wed Aug 16 17:02:35 2006 +0100
@@ -75,6 +75,24 @@ static __inline__ void clear_bit(int nr,
                :"=m" (ADDR)
                :"dIr" (nr));
 }
+
+/**
+ * __clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * Unlike clear_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static __inline__ void __clear_bit(int nr, volatile void * addr)
+{
+       __asm__(
+               "btrl %1,%0"
+               :"=m" (ADDR)
+               :"dIr" (nr));
+}
+
 #define smp_mb__before_clear_bit()     barrier()
 #define smp_mb__after_clear_bit()      barrier()
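(For illustration:) the comment on __clear_bit() above states the usual atomic/non-atomic contract: clear_bit() is a locked read-modify-write that concurrent writers cannot interleave with, while __clear_bit() is a plain load, mask and store that the caller must serialise. A standalone sketch using compiler builtins in place of the lock-prefixed btrl:

    #include <stdio.h>

    static void clear_bit_atomic(int nr, unsigned long *addr)
    {
        __atomic_fetch_and(&addr[nr / (8 * sizeof(long))],
                           ~(1UL << (nr % (8 * sizeof(long)))), __ATOMIC_SEQ_CST);
    }

    static void clear_bit_nonatomic(int nr, unsigned long *addr)
    {
        addr[nr / (8 * sizeof(long))] &= ~(1UL << (nr % (8 * sizeof(long))));
    }

    int main(void)
    {
        unsigned long word = ~0UL;

        clear_bit_atomic(3, &word);      /* safe against concurrent writers    */
        clear_bit_nonatomic(5, &word);   /* only safe if the caller serialises */
        printf("word = %#lx\n", word);
        return 0;
    }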
 
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h      Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/config.h      Wed Aug 16 17:02:35 2006 +0100
@@ -79,9 +79,14 @@
 
 #ifndef __ASSEMBLY__
 extern unsigned long _end; /* standard ELF symbol */
+
+static inline void FORCE_CRASH(void) __attribute__((noreturn,always_inline)); 
+static inline void FORCE_CRASH(void) 
+{
+    __asm__ __volatile__ ( "ud2" );
+    while(1);
+}
 #endif /* __ASSEMBLY__ */
-
-#define FORCE_CRASH() __asm__ __volatile__ ( "ud2" )
 
 #if defined(__x86_64__)
 
@@ -149,9 +154,14 @@ extern unsigned long _end; /* standard E
 /* Slot 256: read-only guest-accessible machine-to-phys translation table. */
 #define RO_MPT_VIRT_START       (PML4_ADDR(256))
 #define RO_MPT_VIRT_END         (RO_MPT_VIRT_START + PML4_ENTRY_BYTES/2)
+
+// currently unused?
+#if 0
 /* Slot 257: read-only guest-accessible linear page table. */
 #define RO_LINEAR_PT_VIRT_START (PML4_ADDR(257))
 #define RO_LINEAR_PT_VIRT_END   (RO_LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
+#endif
+
 /* Slot 258: linear page table (guest table). */
 #define LINEAR_PT_VIRT_START    (PML4_ADDR(258))
 #define LINEAR_PT_VIRT_END      (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
@@ -175,7 +185,7 @@ extern unsigned long _end; /* standard E
 #define DIRECTMAP_VIRT_START    (PML4_ADDR(262))
 #define DIRECTMAP_VIRT_END      (DIRECTMAP_VIRT_START + PML4_ENTRY_BYTES*2)
 
-#define PGT_base_page_table PGT_l4_page_table
+#define PGT_base_page_table     PGT_l4_page_table
 
 #define __HYPERVISOR_CS64 0xe010
 #define __HYPERVISOR_CS32 0xe008
@@ -274,9 +284,9 @@ extern unsigned long _end; /* standard E
     (L2_PAGETABLE_LAST_XEN_SLOT - L2_PAGETABLE_FIRST_XEN_SLOT + 1)
 
 #ifdef CONFIG_X86_PAE
-# define PGT_base_page_table PGT_l3_page_table
-#else
-# define PGT_base_page_table PGT_l2_page_table
+# define PGT_base_page_table     PGT_l3_page_table
+#else
+# define PGT_base_page_table     PGT_l2_page_table
 #endif
 
 #define __HYPERVISOR_CS 0xe008
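(For illustration:) turning FORCE_CRASH() into a noreturn always_inline function lets the compiler treat everything after a call as unreachable, so crash paths need no dummy return values. A standalone sketch of the same shape, with __builtin_trap() standing in for ud2:

    #include <stdio.h>

    static inline void force_crash(void) __attribute__((noreturn, always_inline));
    static inline void force_crash(void)
    {
        __builtin_trap();        /* raises an invalid-opcode style fault        */
        for ( ;; ) ;             /* convinces the compiler we never fall through */
    }

    static int must_be_positive(int x)
    {
        if ( x <= 0 )
            force_crash();       /* no "return" needed on this path */
        return x * 2;
    }

    int main(void)
    {
        printf("%d\n", must_be_positive(21));
        return 0;
    }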
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/domain.h      Wed Aug 16 17:02:35 2006 +0100
@@ -73,42 +73,42 @@ struct arch_domain
     /* I/O-port admin-specified access capabilities. */
     struct rangeset *ioport_caps;
 
-    /* Shadow mode status and controls. */
-    struct shadow_ops *ops;
-    unsigned int shadow_mode;  /* flags to control shadow table operation */
-    unsigned int shadow_nest;  /* Recursive depth of shadow_lock() nesting */
-
-    /* shadow hashtable */
-    struct shadow_status *shadow_ht;
-    struct shadow_status *shadow_ht_free;
-    struct shadow_status *shadow_ht_extras; /* extra allocation units */
-    unsigned int shadow_extras_count;
-
-    /* shadow dirty bitmap */
+    /* HVM stuff */
+    struct hvm_domain   hvm_domain;
+
+    /* Shadow-translated guest: Pseudophys base address of reserved area. */
+    unsigned long first_reserved_pfn;
+
+    /* Shadow2 stuff */
+    u32               shadow2_mode;  /* flags to control shadow operation */
+    spinlock_t        shadow2_lock;  /* shadow2 domain lock */
+    int               shadow2_locker; /* processor which holds the lock */
+    const char       *shadow2_locker_function; /* Func that took it */
+    struct list_head  shadow2_freelists[SHADOW2_MAX_ORDER + 1]; 
+    struct list_head  shadow2_p2m_freelist;
+    struct list_head  shadow2_p2m_inuse;
+    struct list_head  shadow2_toplevel_shadows;
+    unsigned int      shadow2_total_pages;  /* number of pages allocated */
+    unsigned int      shadow2_free_pages;   /* number of pages on freelists */
+    unsigned int      shadow2_p2m_pages;    /* number of pages in p2m map */
+
+    /* Shadow2 hashtable */
+    struct shadow2_hash_entry *shadow2_hash_table;
+    struct shadow2_hash_entry *shadow2_hash_freelist;
+    struct shadow2_hash_entry *shadow2_hash_allocations;
+    int shadow2_hash_walking;  /* Some function is walking the hash table */
+
+    /* Shadow log-dirty bitmap */
     unsigned long *shadow_dirty_bitmap;
     unsigned int shadow_dirty_bitmap_size;  /* in pages, bit per page */
 
-    /* shadow mode stats */
-    unsigned int shadow_page_count;
-    unsigned int hl2_page_count;
-    unsigned int snapshot_page_count;
-
+    /* Shadow log-dirty mode stats */
     unsigned int shadow_fault_count;
     unsigned int shadow_dirty_count;
 
-    /* full shadow mode */
-    struct out_of_sync_entry *out_of_sync; /* list of out-of-sync pages */
-    struct out_of_sync_entry *out_of_sync_free;
-    struct out_of_sync_entry *out_of_sync_extras;
-    unsigned int out_of_sync_extras_count;
-
-    struct list_head free_shadow_frames;
-
-    pagetable_t         phys_table;         /* guest 1:1 pagetable */
-    struct hvm_domain   hvm_domain;
-
-    /* Shadow-translated guest: Pseudophys base address of reserved area. */
-    unsigned long first_reserved_pfn;
+    /* Shadow translated domain: P2M mapping */
+    pagetable_t phys_table;
+
 } __cacheline_aligned;
 
 #ifdef CONFIG_X86_PAE
@@ -166,25 +166,34 @@ struct arch_vcpu
      */
     l1_pgentry_t *perdomain_ptes;
 
-    pagetable_t  guest_table_user;      /* x86/64: user-space pagetable. */
-    pagetable_t  guest_table;           /* (MA) guest notion of cr3 */
-    pagetable_t  shadow_table;          /* (MA) shadow of guest */
-    pagetable_t  monitor_table;         /* (MA) used in hypervisor */
-
-    l2_pgentry_t *guest_vtable;         /* virtual address of pagetable */
-    l2_pgentry_t *shadow_vtable;        /* virtual address of shadow_table */
-    l2_pgentry_t *monitor_vtable;              /* virtual address of monitor_table */
-    l1_pgentry_t *hl2_vtable;                  /* virtual address of hl2_table */
-
 #ifdef CONFIG_X86_64
-    l3_pgentry_t *guest_vl3table;
-    l4_pgentry_t *guest_vl4table;
-#endif
-
-    unsigned long monitor_shadow_ref;
+    pagetable_t guest_table_user;       /* (MFN) x86/64 user-space pagetable */
+#endif
+    pagetable_t guest_table;            /* (MFN) guest notion of cr3 */
+    /* guest_table holds a ref to the page, and also a type-count unless
+     * shadow refcounts are in use */
+    pagetable_t shadow_table;           /* (MFN) shadow of guest */
+    pagetable_t monitor_table;          /* (MFN) hypervisor PT (for HVM) */
+    unsigned long cr3;                     /* (MA) value to install in HW CR3 */
+
+    void *guest_vtable;                 /* virtual address of pagetable */
+    void *shadow_vtable;                /* virtual address of shadow_table */
+    root_pgentry_t *monitor_vtable;            /* virtual address of monitor_table */
 
     /* Current LDT details. */
     unsigned long shadow_ldt_mapcnt;
+
+    /* Shadow2 stuff */
+    /* -- pointers to mode-specific entry points */
+    struct shadow2_entry_points *shadow2; 
+    unsigned long last_emulated_mfn;    /* last mfn we emulated a write to */
+    u8 shadow2_propagate_fault;         /* emulated fault needs to be */
+                                        /* propagated to guest */
+#if CONFIG_PAGING_LEVELS >= 3
+    u8 shadow2_pae_flip_pending;        /* shadow update requires this PAE cpu
+                                         * to recopy/install its L3 table.
+                                         */
+#endif
 } __cacheline_aligned;
 
 /* shorthands to improve code legibility */
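(For illustration:) the new shadow2_lock fields record which processor took the lock and from which function, so recursive acquisition can be spotted and reported. A standalone sketch of that bookkeeping built on a pthread mutex; the struct and helper names are invented for the example:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct dbg_lock {
        pthread_mutex_t mtx;
        int             held;        /* is the lock currently held?        */
        pthread_t       holder;      /* thread that holds it               */
        const char     *holder_fn;   /* function that took it (for errors) */
    };

    #define dbg_lock_acquire(l)  dbg_lock_acquire_fn((l), __func__)

    static void dbg_lock_acquire_fn(struct dbg_lock *l, const char *fn)
    {
        /* Best-effort self-deadlock check, in the spirit of the shadow2_locker test. */
        if ( l->held && pthread_equal(l->holder, pthread_self()) )
        {
            fprintf(stderr, "recursive lock: already taken by %s\n", l->holder_fn);
            abort();
        }
        pthread_mutex_lock(&l->mtx);
        l->held      = 1;
        l->holder    = pthread_self();
        l->holder_fn = fn;
    }

    static void dbg_lock_release(struct dbg_lock *l)
    {
        l->held      = 0;
        l->holder_fn = NULL;
        pthread_mutex_unlock(&l->mtx);
    }

    int main(void)
    {
        struct dbg_lock l = { .mtx = PTHREAD_MUTEX_INITIALIZER };

        dbg_lock_acquire(&l);
        dbg_lock_release(&l);
        puts("lock taken and released once");
        return 0;
    }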
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/grant_table.h
--- a/xen/include/asm-x86/grant_table.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/grant_table.h Wed Aug 16 17:02:35 2006 +0100
@@ -31,7 +31,7 @@ int destroy_grant_host_mapping(
 #define gnttab_shared_gmfn(d, t, i)                     \
     (mfn_to_gmfn(d, gnttab_shared_mfn(d, t, i)))
 
-#define gnttab_log_dirty(d, f) mark_dirty((d), (f))
+#define gnttab_mark_dirty(d, f) mark_dirty((d), (f))
 
 static inline void gnttab_clear_flag(unsigned long nr, uint16_t *addr)
 {
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h     Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/hvm/hvm.h     Wed Aug 16 17:02:35 2006 +0100
@@ -56,8 +56,15 @@ struct hvm_function_table {
      */
     int (*realmode)(struct vcpu *v);
     int (*paging_enabled)(struct vcpu *v);
+    int (*long_mode_enabled)(struct vcpu *v);
+    int (*guest_x86_mode)(struct vcpu *v);
     int (*instruction_length)(struct vcpu *v);
     unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num);
+
+    /* 
+     * Re-set the value of CR3 that Xen runs on when handling VM exits
+     */
+    void (*update_host_cr3)(struct vcpu *v);
 
     /*
      * Update specifics of the guest state:
@@ -134,9 +141,27 @@ hvm_paging_enabled(struct vcpu *v)
 }
 
 static inline int
+hvm_long_mode_enabled(struct vcpu *v)
+{
+    return hvm_funcs.long_mode_enabled(v);
+}
+
+static inline int
+hvm_guest_x86_mode(struct vcpu *v)
+{
+    return hvm_funcs.guest_x86_mode(v);
+}
+
+static inline int
 hvm_instruction_length(struct vcpu *v)
 {
     return hvm_funcs.instruction_length(v);
+}
+
+static inline void
+hvm_update_host_cr3(struct vcpu *v)
+{
+    hvm_funcs.update_host_cr3(v);
 }
 
 void hvm_hypercall_page_initialise(struct domain *d,
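(For illustration:) the new update_host_cr3 hook follows the existing hvm_funcs pattern: each implementation fills in a function table and common code calls through a thin inline wrapper, so it never names VMX or SVM directly. A standalone sketch of that indirection with invented names:

    #include <stdio.h>

    struct vcpu { unsigned long cr3; };

    struct hvm_ops {
        void (*update_host_cr3)(struct vcpu *v);
    };

    /* One possible backend, standing in for the VMX implementation. */
    static void fake_vmx_update_host_cr3(struct vcpu *v)
    {
        printf("writing HOST_CR3 = %#lx into the VMCS\n", v->cr3);
    }

    static struct hvm_ops hvm_funcs = { .update_host_cr3 = fake_vmx_update_host_cr3 };

    /* The wrapper common code calls, mirroring hvm_update_host_cr3(). */
    static inline void hvm_update_host_cr3(struct vcpu *v)
    {
        hvm_funcs.update_host_cr3(v);
    }

    int main(void)
    {
        struct vcpu v = { .cr3 = 0x1234000 };
        hvm_update_host_cr3(&v);
        return 0;
    }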
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/hvm/support.h
--- a/xen/include/asm-x86/hvm/support.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/hvm/support.h Wed Aug 16 17:02:35 2006 +0100
@@ -116,10 +116,13 @@ enum hval_bitmaps {
 #define DBG_LEVEL_IOAPIC            (1 << 9)
 
 extern unsigned int opt_hvm_debug_level;
-#define HVM_DBG_LOG(level, _f, _a...)           \
-    if ( (level) & opt_hvm_debug_level )        \
-        printk("[HVM:%d.%d] <%s> " _f "\n",     \
-               current->domain->domain_id, current->vcpu_id, __func__, ## _a)
+#define HVM_DBG_LOG(level, _f, _a...)                                         \
+    do {                                                                      \
+        if ( (level) & opt_hvm_debug_level )                                  \
+            printk("[HVM:%d.%d] <%s> " _f "\n",                               \
+                   current->domain->domain_id, current->vcpu_id, __func__,    \
+                   ## _a);                                                    \
+    } while (0)
 #else
 #define HVM_DBG_LOG(level, _f, _a...)
 #endif
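(For illustration:) wrapping HVM_DBG_LOG in do { ... } while (0) makes the macro expand to a single statement, so it composes safely with if/else at the call site. A minimal standalone illustration:

    #include <stdio.h>

    static unsigned int opt_debug_level = 1;

    #define DBG_LOG(level, fmt, ...)                          \
        do {                                                  \
            if ( (level) & opt_debug_level )                  \
                printf("[dbg] " fmt "\n", ##__VA_ARGS__);     \
        } while (0)

    int main(void)
    {
        int fault = 1;

        /* With a bare if() inside the macro, this if/else would mis-parse:
         * the 'else' would bind to the macro's internal 'if'. */
        if ( fault )
            DBG_LOG(1, "handling fault %d", fault);
        else
            printf("no fault\n");

        return 0;
    }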
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/hvm/vcpu.h
--- a/xen/include/asm-x86/hvm/vcpu.h    Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/hvm/vcpu.h    Wed Aug 16 17:02:35 2006 +0100
@@ -29,6 +29,7 @@
 #define HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI     1
 
 struct hvm_vcpu {
+    unsigned long       hw_cr3;     /* value we give to HW to use */
     unsigned long       ioflags;
     struct hvm_io_op    io_op;
     struct vlapic       *vlapic;
@@ -39,6 +40,11 @@ struct hvm_vcpu {
     unsigned long       init_sipi_sipi_state;
 
     int                 xen_port;
+
+#if CONFIG_PAGING_LEVELS >= 3
+    l3_pgentry_t hvm_lowmem_l3tab[4]
+    __attribute__((__aligned__(32)));
+#endif
 
     /* Flags */
     int                 flag_dr_dirty;
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h        Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h        Wed Aug 16 17:02:35 2006 +0100
@@ -87,6 +87,7 @@ struct arch_vmx_struct {
 
     unsigned long        cpu_cr0; /* copy of guest CR0 */
     unsigned long        cpu_shadow_cr0; /* copy of guest read shadow CR0 */
+    unsigned long        cpu_shadow_cr4; /* copy of guest read shadow CR4 */
     unsigned long        cpu_cr2; /* save CR2 */
     unsigned long        cpu_cr3;
     unsigned long        cpu_state;
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/hvm/vmx/vmx.h
--- a/xen/include/asm-x86/hvm/vmx/vmx.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h Wed Aug 16 17:02:35 2006 +0100
@@ -298,6 +298,9 @@ static always_inline void __vmwrite_vcpu
     case GUEST_CR0:
         v->arch.hvm_vmx.cpu_cr0 = value;
         break;
+    case CR4_READ_SHADOW:
+        v->arch.hvm_vmx.cpu_shadow_cr4 = value;
+        break;
     case CPU_BASED_VM_EXEC_CONTROL:
         v->arch.hvm_vmx.cpu_based_exec_control = value;
         break;
@@ -317,11 +320,14 @@ static always_inline void __vmread_vcpu(
     case GUEST_CR0:
         *value = v->arch.hvm_vmx.cpu_cr0;
         break;
+    case CR4_READ_SHADOW:
+        *value = v->arch.hvm_vmx.cpu_shadow_cr4;
+        break;
     case CPU_BASED_VM_EXEC_CONTROL:
         *value = v->arch.hvm_vmx.cpu_based_exec_control;
         break;
     default:
-        printk("__vmread_cpu: invalid field %lx\n", field);
+        printk("__vmread_vcpu: invalid field %lx\n", field);
         break;
     }
 }
@@ -342,6 +348,7 @@ static inline int __vmwrite(unsigned lon
     switch ( field ) {
     case CR0_READ_SHADOW:
     case GUEST_CR0:
+    case CR4_READ_SHADOW:
     case CPU_BASED_VM_EXEC_CONTROL:
         __vmwrite_vcpu(v, field, value);
         break;
@@ -402,6 +409,46 @@ static inline int vmx_paging_enabled(str
 
     __vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
     return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG);
+}
+
+/* Works only for vcpu == current */
+static inline int vmx_long_mode_enabled(struct vcpu *v)
+{
+    ASSERT(v == current);
+    return VMX_LONG_GUEST(current);
+}
+
+/* Works only for vcpu == current */
+static inline int vmx_realmode(struct vcpu *v)
+{
+    unsigned long rflags;
+    ASSERT(v == current);
+
+    __vmread(GUEST_RFLAGS, &rflags);
+    return rflags & X86_EFLAGS_VM;
+}
+
+/* Works only for vcpu == current */
+static inline void vmx_update_host_cr3(struct vcpu *v)
+{
+    ASSERT(v == current);
+    __vmwrite(HOST_CR3, v->arch.cr3);
+}
+
+static inline int vmx_guest_x86_mode(struct vcpu *v)
+{
+    unsigned long cs_ar_bytes;
+    ASSERT(v == current);
+
+    if ( vmx_long_mode_enabled(v) )
+    {
+        __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
+        return (cs_ar_bytes & (1u<<13)) ? 8 : 4;
+    }
+    if ( vmx_realmode(v) )
+        return 2;
+    __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
+    return (cs_ar_bytes & (1u<<14)) ? 4 : 2;
 }
 
 static inline int vmx_pgbit_test(struct vcpu *v)
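(For illustration:) vmx_guest_x86_mode() above decodes the guest's CS access-rights bits: when long mode is active, bit 13 (CS.L) selects 64-bit versus compatibility code, otherwise real mode forces 16-bit and bit 14 (CS.D/B) picks the 32- or 16-bit default size. A standalone sketch with plain parameters in place of the VMCS reads:

    #include <stdio.h>

    static int guest_x86_mode(int long_mode, int real_mode, unsigned long cs_ar)
    {
        if ( long_mode )                            /* guest has long mode active   */
            return (cs_ar & (1u << 13)) ? 8 : 4;    /* CS.L: 64-bit or compat code  */
        if ( real_mode )
            return 2;                               /* real/vm86 mode: 16-bit       */
        return (cs_ar & (1u << 14)) ? 4 : 2;        /* CS.D/B default operand size  */
    }

    int main(void)
    {
        printf("%d %d %d\n",
               guest_x86_mode(1, 0, 1u << 13),      /* 64-bit guest code            */
               guest_x86_mode(0, 0, 1u << 14),      /* 32-bit protected mode        */
               guest_x86_mode(0, 1, 0));            /* real mode                    */
        return 0;
    }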
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/mm.h  Wed Aug 16 17:02:35 2006 +0100
@@ -20,7 +20,11 @@ struct page_info
 struct page_info
 {
     /* Each frame can be threaded onto a doubly-linked list. */
-    struct list_head list;
+    union {
+        struct list_head list;
+        /* Shadow2 uses this field as an up-pointer in lower-level shadows */
+        paddr_t up;
+    };
 
     /* Reference count and various PGC_xxx flags and fields. */
     u32 count_info;
@@ -46,8 +50,20 @@ struct page_info
 
     } u;
 
-    /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */
-    u32 tlbflush_timestamp;
+    union {
+        /* Timestamp from 'TLB clock', used to reduce need for safety
+         * flushes.  Only valid on a) free pages, and b) guest pages with a
+         * zero type count. */
+        u32 tlbflush_timestamp;
+
+        /* Only used on guest pages with a shadow.
+         * Guest pages with a shadow must have a non-zero type count, so this
+         * does not conflict with the tlbflush timestamp. */
+        u32 shadow2_flags;
+
+        // XXX -- we expect to add another field here, to be used for min/max
+        // purposes, which is only used for shadow pages.
+    };
 };
 
  /* The following page types are MUTUALLY EXCLUSIVE. */
@@ -60,6 +76,7 @@ struct page_info
 #define PGT_ldt_page        (6U<<29) /* using this page in an LDT? */
 #define PGT_writable_page   (7U<<29) /* has writable mappings of this page? */
 
+#ifndef SHADOW2
 #define PGT_l1_shadow       PGT_l1_page_table
 #define PGT_l2_shadow       PGT_l2_page_table
 #define PGT_l3_shadow       PGT_l3_page_table
@@ -69,14 +86,16 @@ struct page_info
 #define PGT_writable_pred   (7U<<29) /* predicted gpfn with writable ref */
 
 #define PGT_fl1_shadow      (5U<<29)
+#endif
+
 #define PGT_type_mask       (7U<<29) /* Bits 29-31. */
 
+ /* Owning guest has pinned this page to its current type? */
+#define _PGT_pinned         28
+#define PGT_pinned          (1U<<_PGT_pinned)
  /* Has this page been validated for use as its current type? */
-#define _PGT_validated      28
+#define _PGT_validated      27
 #define PGT_validated       (1U<<_PGT_validated)
- /* Owning guest has pinned this page to its current type? */
-#define _PGT_pinned         27
-#define PGT_pinned          (1U<<_PGT_pinned)
 #if defined(__i386__)
  /* The 11 most significant bits of virt address if this is a page table. */
 #define PGT_va_shift        16
@@ -98,6 +117,7 @@ struct page_info
  /* 16-bit count of uses of this frame as its current type. */
 #define PGT_count_mask      ((1U<<16)-1)
 
+#ifndef SHADOW2
 #ifdef __x86_64__
 #define PGT_high_mfn_shift  52
 #define PGT_high_mfn_mask   (0xfffUL << PGT_high_mfn_shift)
@@ -112,19 +132,53 @@ struct page_info
 #define PGT_score_shift     23
 #define PGT_score_mask      (((1U<<4)-1)<<PGT_score_shift)
 #endif
+#endif /* SHADOW2 */
 
  /* Cleared when the owning guest 'frees' this page. */
 #define _PGC_allocated      31
 #define PGC_allocated       (1U<<_PGC_allocated)
- /* Set when fullshadow mode marks a page out-of-sync */
+ /* Set on a *guest* page to mark it out-of-sync with its shadow */
 #define _PGC_out_of_sync     30
 #define PGC_out_of_sync     (1U<<_PGC_out_of_sync)
- /* Set when fullshadow mode is using a page as a page table */
+ /* Set when a page is in use as a page table */
 #define _PGC_page_table      29
 #define PGC_page_table      (1U<<_PGC_page_table)
  /* 29-bit count of references to this frame. */
 #define PGC_count_mask      ((1U<<29)-1)
 
+/* shadow2 uses the count_info on shadow pages somewhat differently */
+/* NB: please coordinate any changes here with the SH2F's in shadow2.h */
+#define PGC_SH2_none           (0U<<28) /* on the shadow2 free list */
+#define PGC_SH2_min_shadow     (1U<<28)
+#define PGC_SH2_l1_32_shadow   (1U<<28) /* shadowing a 32-bit L1 guest page */
+#define PGC_SH2_fl1_32_shadow  (2U<<28) /* L1 shadow for a 32b 4M superpage */
+#define PGC_SH2_l2_32_shadow   (3U<<28) /* shadowing a 32-bit L2 guest page */
+#define PGC_SH2_l1_pae_shadow  (4U<<28) /* shadowing a pae L1 page */
+#define PGC_SH2_fl1_pae_shadow (5U<<28) /* L1 shadow for pae 2M superpg */
+#define PGC_SH2_l2_pae_shadow  (6U<<28) /* shadowing a pae L2-low page */
+#define PGC_SH2_l2h_pae_shadow (7U<<28) /* shadowing a pae L2-high page */
+#define PGC_SH2_l3_pae_shadow  (8U<<28) /* shadowing a pae L3 page */
+#define PGC_SH2_l1_64_shadow   (9U<<28) /* shadowing a 64-bit L1 page */
+#define PGC_SH2_fl1_64_shadow (10U<<28) /* L1 shadow for 64-bit 2M superpg */
+#define PGC_SH2_l2_64_shadow  (11U<<28) /* shadowing a 64-bit L2 page */
+#define PGC_SH2_l3_64_shadow  (12U<<28) /* shadowing a 64-bit L3 page */
+#define PGC_SH2_l4_64_shadow  (13U<<28) /* shadowing a 64-bit L4 page */
+#define PGC_SH2_max_shadow    (13U<<28)
+#define PGC_SH2_p2m_table     (14U<<28) /* in use as the p2m table */
+#define PGC_SH2_monitor_table (15U<<28) /* in use as a monitor table */
+#define PGC_SH2_unused        (15U<<28)
+
+#define PGC_SH2_type_mask     (15U<<28)
+#define PGC_SH2_type_shift          28
+
+#define PGC_SH2_pinned         (1U<<27)
+
+#define _PGC_SH2_log_dirty          26
+#define PGC_SH2_log_dirty      (1U<<26)
+
+/* 26 bit ref count for shadow pages */
+#define PGC_SH2_count_mask    ((1U<<26) - 1)
+
 /* We trust the slab allocator in slab.c, and our use of it. */
 #define PageSlab(page)     (1)
 #define PageSetSlab(page)   ((void)0)
@@ -134,14 +188,22 @@ struct page_info
 
 #if defined(__i386__)
 #define pickle_domptr(_d)   ((u32)(unsigned long)(_d))
-#define unpickle_domptr(_d) ((struct domain *)(unsigned long)(_d))
+static inline struct domain *unpickle_domptr(u32 _domain)
+{ return (_domain & 1) ? NULL : (void *)_domain; }
 #define PRtype_info "08lx" /* should only be used for printk's */
 #elif defined(__x86_64__)
 static inline struct domain *unpickle_domptr(u32 _domain)
-{ return (_domain == 0) ? NULL : __va(_domain); }
+{ return ((_domain == 0) || (_domain & 1)) ? NULL : __va(_domain); }
 static inline u32 pickle_domptr(struct domain *domain)
 { return (domain == NULL) ? 0 : (u32)__pa(domain); }
 #define PRtype_info "016lx"/* should only be used for printk's */
+#endif
+
+/* The order of the largest allocation unit we use for shadow pages */
+#if CONFIG_PAGING_LEVELS == 2
+#define SHADOW2_MAX_ORDER 0 /* Only ever need 4k allocations */
+#else  
+#define SHADOW2_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
 #endif
 
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
@@ -165,7 +227,7 @@ extern int shadow_remove_all_write_acces
 extern int shadow_remove_all_write_access(
     struct domain *d, unsigned long gmfn, unsigned long mfn);
 extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn);
-extern int _shadow_mode_refcounts(struct domain *d);
+extern int _shadow2_mode_refcounts(struct domain *d);
 
 static inline void put_page(struct page_info *page)
 {
@@ -197,8 +259,8 @@ static inline int get_page(struct page_i
              unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
              unlikely(d != _domain) )                /* Wrong owner? */
         {
-            if ( !_shadow_mode_refcounts(domain) )
-                DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
+            if ( !_shadow2_mode_refcounts(domain) )
+                DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%" 
                         PRtype_info "\n",
                         page_to_mfn(page), domain, unpickle_domptr(d),
                         x, page->u.inuse.type_info);
@@ -254,6 +316,16 @@ static inline int page_is_removable(stru
     ASSERT(((_p)->count_info & PGC_count_mask) != 0);          \
     ASSERT(page_get_owner(_p) == (_d))
 
+// Quick test for whether a given page can be represented directly in CR3.
+//
+#if CONFIG_PAGING_LEVELS == 3
+#define MFN_FITS_IN_CR3(_MFN) !(mfn_x(_MFN) >> 20)
+
+/* returns a lowmem machine address of the copied L3 root table */
+unsigned long
+pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab);
+#endif /* CONFIG_PAGING_LEVELS == 3 */
+
 int check_descriptor(struct desc_struct *d);
 
 /*
@@ -271,29 +343,44 @@ int check_descriptor(struct desc_struct 
 #define set_gpfn_from_mfn(mfn, pfn) (machine_to_phys_mapping[(mfn)] = (pfn))
 #define get_gpfn_from_mfn(mfn)      (machine_to_phys_mapping[(mfn)])
 
+
+#define mfn_to_gmfn(_d, mfn)                            \
+    ( (shadow2_mode_translate(_d))                      \
+      ? get_gpfn_from_mfn(mfn)                          \
+      : (mfn) )
+
+#define gmfn_to_mfn(_d, gpfn)  mfn_x(sh2_gfn_to_mfn(_d, gpfn))
+
+
 /*
  * The phys_to_machine_mapping is the reversed mapping of MPT for full
  * virtualization.  It is only used by shadow_mode_translate()==true
  * guests, so we steal the address space that would have normally
  * been used by the read-only MPT map.
  */
-#define phys_to_machine_mapping ((unsigned long *)RO_MPT_VIRT_START)
-#define NR_P2M_TABLE_ENTRIES    ((unsigned long *)RO_MPT_VIRT_END \
-                                 - phys_to_machine_mapping)
+#define phys_to_machine_mapping ((l1_pgentry_t *)RO_MPT_VIRT_START)
 #define INVALID_MFN             (~0UL)
 #define VALID_MFN(_mfn)         (!((_mfn) & (1U<<31)))
 
-#define set_mfn_from_gpfn(pfn, mfn) (phys_to_machine_mapping[(pfn)] = (mfn))
 static inline unsigned long get_mfn_from_gpfn(unsigned long pfn)
 {
-    unsigned long mfn;
-
-    if ( unlikely(pfn >= NR_P2M_TABLE_ENTRIES) ||
-         unlikely(__copy_from_user(&mfn, &phys_to_machine_mapping[pfn],
-                                   sizeof(mfn))) )
-       mfn = INVALID_MFN;
-
-    return mfn;
+    l1_pgentry_t l1e = l1e_empty();
+    int ret;
+
+#if CONFIG_PAGING_LEVELS > 2
+    if ( pfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof (l1_pgentry_t) ) 
+        /* This pfn is higher than the p2m map can hold */
+        return INVALID_MFN;
+#endif
+
+    ret = __copy_from_user(&l1e,
+                               &phys_to_machine_mapping[pfn],
+                               sizeof(l1e));
+
+    if ( (ret == 0) && (l1e_get_flags(l1e) & _PAGE_PRESENT) )
+        return l1e_get_pfn(l1e);
+
+    return INVALID_MFN;
 }
 
 #ifdef MEMORY_GUARD
@@ -333,6 +420,7 @@ void audit_domains(void);
 #endif
 
 int new_guest_cr3(unsigned long pfn);
+void make_cr3(struct vcpu *v, unsigned long mfn);
 
 void propagate_page_fault(unsigned long addr, u16 error_code);
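(For illustration:) the PGC_SH2_* definitions above pack a 4-bit shadow type, a pinned bit, a log-dirty bit and a 26-bit reference count into the single 32-bit count_info word of a shadow page. A standalone sketch of that packing:

    #include <stdint.h>
    #include <stdio.h>

    #define SH2_TYPE_SHIFT    28
    #define SH2_TYPE_MASK     (15u << SH2_TYPE_SHIFT)   /* bits 31-28 */
    #define SH2_PINNED        (1u << 27)
    #define SH2_LOG_DIRTY     (1u << 26)
    #define SH2_COUNT_MASK    ((1u << 26) - 1)          /* bits 25-0  */

    #define SH2_l2_pae_shadow (6u << SH2_TYPE_SHIFT)    /* example type value */

    int main(void)
    {
        uint32_t count_info = 0;

        count_info |= SH2_l2_pae_shadow;                /* set the shadow type */
        count_info |= SH2_PINNED;                       /* pin it              */
        count_info = (count_info & ~SH2_COUNT_MASK) |
                     ((count_info & SH2_COUNT_MASK) + 1);   /* take a reference */

        printf("type=%u pinned=%u logdirty=%u refs=%u\n",
               (count_info & SH2_TYPE_MASK) >> SH2_TYPE_SHIFT,
               !!(count_info & SH2_PINNED),
               !!(count_info & SH2_LOG_DIRTY),
               count_info & SH2_COUNT_MASK);
        return 0;
    }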
 
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/msr.h
--- a/xen/include/asm-x86/msr.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/msr.h Wed Aug 16 17:02:35 2006 +0100
@@ -112,6 +112,10 @@ static inline void wrmsrl(unsigned int m
 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
 #define MSR_IA32_VMX_MISC_MSR                   0x485
+#define MSR_IA32_VMX_CR0_FIXED0                 0x486
+#define MSR_IA32_VMX_CR0_FIXED1                 0x487
+#define MSR_IA32_VMX_CR4_FIXED0                 0x488
+#define MSR_IA32_VMX_CR4_FIXED1                 0x489
 #define IA32_FEATURE_CONTROL_MSR                0x3a
 #define IA32_FEATURE_CONTROL_MSR_LOCK           0x1
 #define IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON   0x4
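(For illustration:) the new MSR_IA32_VMX_CR0/CR4_FIXED0/FIXED1 numbers name the Intel-defined MSRs that describe which control-register bits must be 1 and which may be 1 while VMX is on; the usual adjustment is (val | fixed0) & fixed1. A standalone sketch with made-up MSR values:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t apply_vmx_fixed_bits(uint64_t val,
                                         uint64_t fixed0, uint64_t fixed1)
    {
        return (val | fixed0) & fixed1;   /* force required bits on, disallowed bits off */
    }

    int main(void)
    {
        uint64_t cr4        = 0x000006f8;   /* candidate CR4                  */
        uint64_t cr4_fixed0 = 0x00002000;   /* e.g. CR4.VMXE must remain set  */
        uint64_t cr4_fixed1 = 0x000427ff;   /* bits the CPU allows to be set  */

        printf("adjusted CR4 = %#llx\n",
               (unsigned long long)apply_vmx_fixed_bits(cr4, cr4_fixed0, cr4_fixed1));
        return 0;
    }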
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/page-guest32.h
--- a/xen/include/asm-x86/page-guest32.h        Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/page-guest32.h        Wed Aug 16 17:02:35 2006 +0100
@@ -89,15 +89,8 @@ static inline l2_pgentry_32_t l2e_from_p
 
 #define linear_l1_table_32                                                 \
     ((l1_pgentry_32_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table_32                                                 \
-    ((l2_pgentry_32_t *)(LINEAR_PT_VIRT_START +                            \
-                     (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0))))
 
 #define linear_pg_table_32 linear_l1_table_32
-#define linear_l2_table_32(_ed) ((_ed)->arch.guest_vtable)
-
-#define va_to_l1mfn_32(_ed, _va) \
-    (l2e_get_pfn(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]))
 
 #endif /* __X86_PAGE_GUEST_H__ */
 
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/page.h
--- a/xen/include/asm-x86/page.h        Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/page.h        Wed Aug 16 17:02:35 2006 +0100
@@ -233,26 +233,18 @@ typedef struct { u64 pfn; } pagetable_t;
      + DOMAIN_ENTRIES_PER_L4_PAGETABLE)
 #endif
 
-#define LINEAR_PT_OFFSET (LINEAR_PT_VIRT_START & VADDR_MASK)
-#define linear_l1_table                                             \
-    ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table                                           \
-    ((l2_pgentry_t *)(LINEAR_PT_VIRT_START +                        \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0))))
-#define __linear_l3_table                                           \
-    ((l3_pgentry_t *)(LINEAR_PT_VIRT_START +                        \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) +   \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1))))
-#define __linear_l4_table                                           \
-    ((l4_pgentry_t *)(LINEAR_PT_VIRT_START +                        \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) +   \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1)) +   \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<2))))
-
+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
+#define linear_l1_table __linear_l1_table
 #define linear_pg_table linear_l1_table
-#define linear_l2_table(v) ((v)->arch.guest_vtable)
-#define linear_l3_table(v) ((v)->arch.guest_vl3table)
-#define linear_l4_table(v) ((v)->arch.guest_vl4table)
+#define linear_l2_table(v) ((l2_pgentry_t *)(v)->arch.guest_vtable)
 
 #ifndef __ASSEMBLY__
 #if CONFIG_PAGING_LEVELS == 3
@@ -294,6 +286,7 @@ extern void paging_init(void);
 #define _PAGE_AVAIL1   0x400U
 #define _PAGE_AVAIL2   0x800U
 #define _PAGE_AVAIL    0xE00U
+#define _PAGE_PSE_PAT 0x1000U
 
 /*
  * Debug option: Ensure that granted mappings are not implicitly unmapped.
@@ -307,9 +300,9 @@ extern void paging_init(void);
 #endif
 
 /*
- * Disallow unused flag bits plus PAT, PSE and GLOBAL. Also disallow GNTTAB
- * if we are using it for grant-table debugging. Permit the NX bit if the
- * hardware supports it.
+ * Disallow unused flag bits plus PAT, PSE and GLOBAL.
+ * Also disallow GNTTAB if we are using it for grant-table debugging.
+ * Permit the NX bit if the hardware supports it.
  */
 #define BASE_DISALLOW_MASK ((0xFFFFF180U | _PAGE_GNTTAB) & ~_PAGE_NX)
 
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h  Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/perfc_defn.h  Wed Aug 16 17:02:35 2006 +0100
@@ -144,4 +144,57 @@ PERFCOUNTER_CPU(remove_write_bad_predict
 PERFCOUNTER_CPU(remove_write_bad_prediction, "remove_write bad prediction")
 PERFCOUNTER_CPU(update_hl2e_invlpg,     "update_hl2e calls invlpg")
 
+/* Shadow2 counters */
+PERFCOUNTER_CPU(shadow2_alloc,          "calls to shadow2_alloc")
+PERFCOUNTER_CPU(shadow2_alloc_tlbflush, "shadow2_alloc flushed TLBs")
+PERFSTATUS(shadow2_alloc_count,         "number of shadow pages in use")
+PERFCOUNTER_CPU(shadow2_free,           "calls to shadow2_free")
+PERFCOUNTER_CPU(shadow2_prealloc_1,     "shadow2 recycles old shadows")
+PERFCOUNTER_CPU(shadow2_prealloc_2,     "shadow2 recycles in-use shadows")
+PERFCOUNTER_CPU(shadow2_linear_map_failed, "shadow2 hit read-only linear map")
+PERFCOUNTER_CPU(shadow2_a_update,       "shadow2 A bit update")
+PERFCOUNTER_CPU(shadow2_ad_update,      "shadow2 A&D bit update")
+PERFCOUNTER_CPU(shadow2_fault,          "calls to shadow2_fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_bad_gfn, "shadow2_fault guest bad gfn")
+PERFCOUNTER_CPU(shadow2_fault_bail_not_present, 
+                                        "shadow2_fault guest not-present")
+PERFCOUNTER_CPU(shadow2_fault_bail_nx,  "shadow2_fault guest NX fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_ro_mapping, "shadow2_fault guest R/W fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_user_supervisor, 
+                                        "shadow2_fault guest U/S fault")
+PERFCOUNTER_CPU(shadow2_fault_emulate_read, "shadow2_fault emulates a read")
+PERFCOUNTER_CPU(shadow2_fault_emulate_write, "shadow2_fault emulates a write")
+PERFCOUNTER_CPU(shadow2_fault_emulate_failed, "shadow2_fault emulator fails")
+PERFCOUNTER_CPU(shadow2_fault_mmio,     "shadow2_fault handled as mmio")
+PERFCOUNTER_CPU(shadow2_fault_fixed,    "shadow2_fault fixed fault")
+PERFCOUNTER_CPU(shadow2_ptwr_emulate,   "shadow2 causes ptwr to emulate")
+PERFCOUNTER_CPU(shadow2_validate_gl1e_calls, "calls to shadow2_validate_gl1e")
+PERFCOUNTER_CPU(shadow2_validate_gl2e_calls, "calls to shadow2_validate_gl2e")
+PERFCOUNTER_CPU(shadow2_validate_gl3e_calls, "calls to shadow2_validate_gl3e")
+PERFCOUNTER_CPU(shadow2_validate_gl4e_calls, "calls to shadow2_validate_gl4e")
+PERFCOUNTER_CPU(shadow2_hash_lookups,   "calls to shadow2_hash_lookup")
+PERFCOUNTER_CPU(shadow2_hash_lookup_head, "shadow2 hash hit in bucket head")
+PERFCOUNTER_CPU(shadow2_hash_lookup_miss, "shadow2 hash misses")
+PERFCOUNTER_CPU(shadow2_get_shadow_status, "calls to get_shadow_status")
+PERFCOUNTER_CPU(shadow2_hash_inserts,   "calls to shadow2_hash_insert")
+PERFCOUNTER_CPU(shadow2_hash_deletes,   "calls to shadow2_hash_delete")
+PERFCOUNTER_CPU(shadow2_writeable,      "shadow2 removes write access")
+PERFCOUNTER_CPU(shadow2_writeable_h_1,  "shadow2 writeable: 32b w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_2,  "shadow2 writeable: 32pae w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_3,  "shadow2 writeable: 64b w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_4,  "shadow2 writeable: 32b linux low")
+PERFCOUNTER_CPU(shadow2_writeable_bf,   "shadow2 writeable brute-force")
+PERFCOUNTER_CPU(shadow2_mappings,       "shadow2 removes all mappings")
+PERFCOUNTER_CPU(shadow2_mappings_bf,    "shadow2 rm-mappings brute-force")
+PERFCOUNTER_CPU(shadow2_early_unshadow, "shadow2 unshadows for fork/exit")
+PERFCOUNTER_CPU(shadow2_early_unshadow_top, "shadow2 unhooks for fork/exit")
+PERFCOUNTER_CPU(shadow2_unshadow,       "shadow2 unshadows a page")
+PERFCOUNTER_CPU(shadow2_up_pointer,     "shadow2 unshadow by up-pointer")
+PERFCOUNTER_CPU(shadow2_unshadow_bf,    "shadow2 unshadow brute-force")
+PERFCOUNTER_CPU(shadow2_get_page_fail,  "shadow2_get_page_from_l1e failed")
+PERFCOUNTER_CPU(shadow2_guest_walk,     "shadow2 walks guest tables")
+PERFCOUNTER_CPU(shadow2_walk_cache_hit, "shadow2 walk-cache hits")
+PERFCOUNTER_CPU(shadow2_walk_cache_miss, "shadow2 walk-cache misses")
+
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h   Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/processor.h   Wed Aug 16 17:02:35 2006 +0100
@@ -545,6 +545,7 @@ extern always_inline void prefetchw(cons
 #endif
 
 void show_stack(struct cpu_user_regs *regs);
+void show_xen_trace(void);
 void show_stack_overflow(unsigned long esp);
 void show_registers(struct cpu_user_regs *regs);
 void show_execution_state(struct cpu_user_regs *regs);
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/shadow.h
--- a/xen/include/asm-x86/shadow.h      Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/shadow.h      Wed Aug 16 17:02:35 2006 +0100
@@ -1,8 +1,7 @@
 /******************************************************************************
  * include/asm-x86/shadow.h
  * 
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
+ * Copyright (c) 2006 by XenSource Inc.
  * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,1782 +21,28 @@
 #ifndef _XEN_SHADOW_H
 #define _XEN_SHADOW_H
 
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/perfc.h>
-#include <xen/sched.h>
-#include <xen/mm.h>
-#include <xen/domain_page.h>
-#include <asm/current.h>
-#include <asm/flushtlb.h>
-#include <asm/processor.h>
-#include <asm/hvm/hvm.h>
-#include <asm/hvm/support.h>
-#include <asm/regs.h>
-#include <public/dom0_ops.h>
-#include <asm/shadow_public.h>
-#include <asm/page-guest32.h>
-#include <asm/shadow_ops.h>
+/* This file is just a wrapper around the new Shadow2 header,
+ * providing names that must be defined in any shadow implementation. */
 
-/* Shadow PT operation mode : shadow-mode variable in arch_domain. */
+#include <asm/shadow2.h>
 
-#define SHM_enable    (1<<0) /* we're in one of the shadow modes */
-#define SHM_refcounts (1<<1) /* refcounts based on shadow tables instead of
-                                guest tables */
-#define SHM_write_all (1<<2) /* allow write access to all guest pt pages,
-                                regardless of pte write permissions */
-#define SHM_log_dirty (1<<3) /* enable log dirty mode */
-#define SHM_translate (1<<4) /* Xen does p2m translation, not guest */
-#define SHM_external  (1<<5) /* Xen does not steal address space from the
-                                domain for its own booking; requires VT or
-                                similar mechanisms */
-#define SHM_wr_pt_pte (1<<6) /* guest allowed to set PAGE_RW bit in PTEs which
-                                point to page table pages. */
+/* How to make sure a page is not referred to in a shadow PT */
+/* This will need to be a for_each_vcpu if we go to per-vcpu shadows */ 
+#define shadow_drop_references(_d, _p)                      \
+    shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
+#define shadow_sync_and_drop_references(_d, _p)             \
+    shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
 
-#define shadow_mode_enabled(_d)   ((_d)->arch.shadow_mode)
-#define shadow_mode_refcounts(_d) ((_d)->arch.shadow_mode & SHM_refcounts)
-#define shadow_mode_write_l1(_d)  (VM_ASSIST(_d, VMASST_TYPE_writable_pagetables))
-#define shadow_mode_write_all(_d) ((_d)->arch.shadow_mode & SHM_write_all)
-#define shadow_mode_log_dirty(_d) ((_d)->arch.shadow_mode & SHM_log_dirty)
-#define shadow_mode_translate(_d) ((_d)->arch.shadow_mode & SHM_translate)
-#define shadow_mode_external(_d)  ((_d)->arch.shadow_mode & SHM_external)
-#define shadow_mode_wr_pt_pte(_d) ((_d)->arch.shadow_mode & SHM_wr_pt_pte)
+/* Whether we are translating the domain's frame numbers for it */
+#define shadow_mode_translate(d)  shadow2_mode_translate(d)
 
-#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
-#define __shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
-     (SH_LINEAR_PT_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
-#define shadow_linear_l2_table(_v) ((_v)->arch.shadow_vtable)
+/* ...and  if so, how to add and remove entries in the mapping */
+#define guest_physmap_add_page(_d, _p, _m)                  \
+    shadow2_guest_physmap_add_page((_d), (_p), (_m))
+#define guest_physmap_remove_page(_d, _p, _m   )            \
+    shadow2_guest_physmap_remove_page((_d), (_p), (_m))
 
-// easy access to the hl2 table (for translated but not external modes only)
-#define __linear_hl2_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START + \
-     (PERDOMAIN_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
-
-/*
- * For now we use the per-domain BIGLOCK rather than a shadow-specific lock.
- * We usually have the BIGLOCK already acquired anyway, so this is unlikely
- * to cause much unnecessary extra serialisation. Also it's a recursive
- * lock, and there are some code paths containing nested shadow_lock().
- * The #if0'ed code below is therefore broken until such nesting is removed.
- */
-#if 0
-#define shadow_lock_init(_d)                    \
-    spin_lock_init(&(_d)->arch.shadow_lock)
-#define shadow_lock_is_acquired(_d)             \
-    spin_is_locked(&(_d)->arch.shadow_lock)
-#define shadow_lock(_d)                         \
-do {                                            \
-    ASSERT(!shadow_lock_is_acquired(_d));       \
-    spin_lock(&(_d)->arch.shadow_lock);         \
-} while (0)
-#define shadow_unlock(_d)                       \
-do {                                            \
-    ASSERT(!shadow_lock_is_acquired(_d));       \
-    spin_unlock(&(_d)->arch.shadow_lock);       \
-} while (0)
-#else
-#define shadow_lock_init(_d)                    \
-    ((_d)->arch.shadow_nest = 0)
-#define shadow_lock_is_acquired(_d)             \
-    (spin_is_locked(&(_d)->big_lock) && ((_d)->arch.shadow_nest != 0))
-#define shadow_lock(_d)                         \
-do {                                            \
-    LOCK_BIGLOCK(_d);                           \
-    (_d)->arch.shadow_nest++;                   \
-} while (0)
-#define shadow_unlock(_d)                       \
-do {                                            \
-    ASSERT(shadow_lock_is_acquired(_d));        \
-    (_d)->arch.shadow_nest--;                   \
-    UNLOCK_BIGLOCK(_d);                         \
-} while (0)
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 3
-static inline u64 get_cr3_idxval(struct vcpu *v)
-{
-    u64 pae_cr3;
-
-    if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 &&
-            !shadow_mode_log_dirty(v->domain) )
-    {
-        pae_cr3 = hvm_get_guest_ctrl_reg(v, 3); /* get CR3 */
-        return (pae_cr3 >> PAE_CR3_ALIGN) & PAE_CR3_IDX_MASK;
-    }
-    else
-        return 0;
-}
-
-#define shadow_key_t u64
-#define index_to_key(x) ((x) << 32)
-#else
-#define get_cr3_idxval(v) (0)
-#define shadow_key_t unsigned long
-#define index_to_key(x)  (0)
-#endif
-
-
-#define SHADOW_ENCODE_MIN_MAX(_min, _max) ((((GUEST_L1_PAGETABLE_ENTRIES - 1) - (_max)) << 16) | (_min))
-#define SHADOW_MIN(_encoded) ((_encoded) & ((1u<<16) - 1))
-#define SHADOW_MAX(_encoded) ((GUEST_L1_PAGETABLE_ENTRIES - 1) - ((_encoded) >> 16))
-extern void shadow_direct_map_clean(struct domain *d);
-extern int shadow_direct_map_init(struct domain *d);
-extern int shadow_direct_map_fault(
-    unsigned long vpa, struct cpu_user_regs *regs);
-extern void shadow_mode_init(void);
-extern int shadow_mode_control(struct domain *p, dom0_shadow_control_t *sc);
-extern int shadow_fault(unsigned long va, struct cpu_user_regs *regs);
-extern int shadow_mode_enable(struct domain *p, unsigned int mode);
-extern void shadow_invlpg(struct vcpu *, unsigned long);
-extern struct out_of_sync_entry *shadow_mark_mfn_out_of_sync(
-    struct vcpu *v, unsigned long gpfn, unsigned long mfn);
-extern void free_monitor_pagetable(struct vcpu *v);
-extern void __shadow_sync_all(struct domain *d);
-extern int __shadow_out_of_sync(struct vcpu *v, unsigned long va);
-extern int set_p2m_entry(
-    struct domain *d, unsigned long pfn, unsigned long mfn,
-    struct domain_mmap_cache *l2cache,
-    struct domain_mmap_cache *l1cache);
-extern void remove_shadow(struct domain *d, unsigned long gpfn, u32 stype);
-
-extern void free_shadow_page(unsigned long smfn);
-
-extern void shadow_l1_normal_pt_update(struct domain *d,
-                                       paddr_t pa, l1_pgentry_t l1e,
-                                       struct domain_mmap_cache *cache);
-extern void shadow_l2_normal_pt_update(struct domain *d,
-                                       paddr_t pa, l2_pgentry_t l2e,
-                                       struct domain_mmap_cache *cache);
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/page-guest32.h>
-/*
- * va_mask cannot be used because it's used by the shadow hash.
- * Use the score area for now.
- */
-#define is_xen_l2_slot(t,s)                                                 \
-    ( ((((t) & PGT_score_mask) >> PGT_score_shift) == 3) &&                 \
-      ((s) >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES - 1))) )
-
-extern unsigned long gva_to_gpa(unsigned long gva);
-extern void shadow_l3_normal_pt_update(struct domain *d,
-                                       paddr_t pa, l3_pgentry_t l3e,
-                                       struct domain_mmap_cache *cache);
-#endif
-#if CONFIG_PAGING_LEVELS >= 4
-extern void shadow_l4_normal_pt_update(struct domain *d,
-                                       paddr_t pa, l4_pgentry_t l4e,
-                                       struct domain_mmap_cache *cache);
-#endif
-extern int shadow_do_update_va_mapping(unsigned long va,
-                                       l1_pgentry_t val,
-                                       struct vcpu *v);
-
-
-static inline unsigned long __shadow_status(
-    struct domain *d, unsigned long gpfn, unsigned long stype);
-
-#if CONFIG_PAGING_LEVELS <= 2
-static inline void update_hl2e(struct vcpu *v, unsigned long va);
-#endif
-
-static inline int page_is_page_table(struct page_info *page)
-{
-    struct domain *owner = page_get_owner(page);
-    u32 type_info;
-
-    if ( owner && shadow_mode_refcounts(owner) )
-        return page->count_info & PGC_page_table;
-
-    type_info = page->u.inuse.type_info & PGT_type_mask;
-    return type_info && (type_info <= PGT_l4_page_table);
-}
-
-static inline int mfn_is_page_table(unsigned long mfn)
-{
-    if ( !mfn_valid(mfn) )
-        return 0;
-
-    return page_is_page_table(mfn_to_page(mfn));
-}
-
-static inline int page_out_of_sync(struct page_info *page)
-{
-    return page->count_info & PGC_out_of_sync;
-}
-
-static inline int mfn_out_of_sync(unsigned long mfn)
-{
-    if ( !mfn_valid(mfn) )
-        return 0;
-
-    return page_out_of_sync(mfn_to_page(mfn));
-}
-
-
-/************************************************************************/
-
-static void inline
-__shadow_sync_mfn(struct domain *d, unsigned long mfn)
-{
-    if ( d->arch.out_of_sync )
-    {
-        // XXX - could be smarter
-        //
-        __shadow_sync_all(d);
-    }
-}
-
-static void inline
-__shadow_sync_va(struct vcpu *v, unsigned long va)
-{
-    struct domain *d = v->domain;
-
-    if ( d->arch.out_of_sync && __shadow_out_of_sync(v, va) )
-    {
-        perfc_incrc(shadow_sync_va);
-
-        // XXX - could be smarter
-        //
-        __shadow_sync_all(v->domain);
-    }
-#if CONFIG_PAGING_LEVELS <= 2
-    // Also make sure the HL2 is up-to-date for this address.
-    //
-    if ( unlikely(shadow_mode_translate(v->domain)) )
-        update_hl2e(v, va);
-#endif
-}
-
-static void inline
-shadow_sync_all(struct domain *d)
-{
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        shadow_lock(d);
-
-        if ( d->arch.out_of_sync )
-            __shadow_sync_all(d);
-
-        ASSERT(d->arch.out_of_sync == NULL);
-
-        shadow_unlock(d);
-    }
-}
-
-// SMP BUG: This routine can't ever be used properly in an SMP context.
-//          It should be something like get_shadow_and_sync_va().
-//          This probably shouldn't exist.
-//
-static void inline
-shadow_sync_va(struct vcpu *v, unsigned long gva)
-{
-    struct domain *d = v->domain;
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        shadow_lock(d);
-        __shadow_sync_va(v, gva);
-        shadow_unlock(d);
-    }
-}
-
-extern void __shadow_mode_disable(struct domain *d);
-static inline void shadow_mode_disable(struct domain *d)
-{
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        shadow_lock(d);
-        __shadow_mode_disable(d);
-        shadow_unlock(d);
-    }
-}
-
-/************************************************************************/
-
-#define mfn_to_gmfn(_d, mfn)                         \
-    ( (shadow_mode_translate(_d))                      \
-      ? get_gpfn_from_mfn(mfn)                          \
-      : (mfn) )
-
-#define gmfn_to_mfn(_d, gpfn)                        \
-    ({                                                 \
-        unlikely(shadow_mode_translate(_d))            \
-        ? (likely(current->domain == (_d))             \
-           ? get_mfn_from_gpfn(gpfn)                    \
-           : get_mfn_from_gpfn_foreign(_d, gpfn))       \
-        : (gpfn);                                      \
-    })
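
Illustrative sketch with a hypothetical helper (not taken from the patch):
callers normally look the frame up and then check it, since translate-mode
lookups can fail.

/* Hypothetical helper, for illustration only: resolve a guest pfn to a
 * machine frame.  For a non-translated domain gmfn_to_mfn() simply hands
 * back gpfn unchanged. */
static inline int example_resolve_gpfn(struct domain *d, unsigned long gpfn,
                                       unsigned long *mfn_p)
{
    unsigned long mfn = gmfn_to_mfn(d, gpfn);

    if ( !VALID_MFN(mfn) )
        return 0;              /* no machine frame backs this guest pfn */

    *mfn_p = mfn;
    return 1;
}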
-
-extern unsigned long get_mfn_from_gpfn_foreign(
-    struct domain *d, unsigned long gpfn);
-
-/************************************************************************/
-
-struct shadow_status {
-    struct shadow_status *next;   /* Pull-to-front list per hash bucket. */
-    shadow_key_t  gpfn_and_flags; /* Guest pfn plus flags. */
-    unsigned long smfn;           /* Shadow mfn.           */
-};
-
-#define shadow_ht_extra_size 128
-#define shadow_ht_buckets    256
-
-struct out_of_sync_entry {
-    struct out_of_sync_entry *next;
-    struct vcpu   *v;
-    unsigned long gpfn;    /* why is this here? */
-    unsigned long gmfn;
-    unsigned long snapshot_mfn;
-    paddr_t writable_pl1e; /* NB: this is a machine address */
-    unsigned long va;
-};
-
-#define out_of_sync_extra_size 127
-
-#define SHADOW_SNAPSHOT_ELSEWHERE (-1L)
-
-/************************************************************************/
-#define SHADOW_DEBUG 0
-#define SHADOW_VERBOSE_DEBUG 0
-#define SHADOW_VVERBOSE_DEBUG 0
-#define SHADOW_VVVERBOSE_DEBUG 0
-#define SHADOW_HASH_DEBUG 0
-#define FULLSHADOW_DEBUG 0
-
-#if SHADOW_DEBUG
-extern int shadow_status_noswap;
-#define SHADOW_REFLECTS_SNAPSHOT _PAGE_AVAIL0
-#endif
-
-#if SHADOW_VERBOSE_DEBUG
-#define SH_LOG(_f, _a...)                                               \
-    printk("DOM%uP%u: SH_LOG(%d): " _f "\n",                            \
-       current->domain->domain_id , smp_processor_id(), __LINE__ , ## _a )
-#define SH_VLOG(_f, _a...)                                              \
-    printk("DOM%uP%u: SH_VLOG(%d): " _f "\n",                           \
-           current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_LOG(_f, _a...) ((void)0)
-#define SH_VLOG(_f, _a...) ((void)0)
-#endif
-
-#if SHADOW_VVERBOSE_DEBUG
-#define SH_VVLOG(_f, _a...)                                             \
-    printk("DOM%uP%u: SH_VVLOG(%d): " _f "\n",                          \
-           current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_VVLOG(_f, _a...) ((void)0)
-#endif
-
-#if SHADOW_VVVERBOSE_DEBUG
-#define SH_VVVLOG(_f, _a...)                                            \
-    printk("DOM%uP%u: SH_VVVLOG(%d): " _f "\n",                         \
-           current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_VVVLOG(_f, _a...) ((void)0)
-#endif
-
-#if FULLSHADOW_DEBUG
-#define FSH_LOG(_f, _a...)                                              \
-    printk("DOM%uP%u: FSH_LOG(%d): " _f "\n",                           \
-           current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define FSH_LOG(_f, _a...) ((void)0)
-#endif
-
-
-/************************************************************************/
-
-static inline int
-shadow_get_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
-{
-    l1_pgentry_t nl1e;
-    int res;
-    unsigned long mfn;
-    struct domain *owner;
-
-    ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT);
-
-    if ( !shadow_mode_refcounts(d) )
-        return 1;
-
-    nl1e = l1e;
-    l1e_remove_flags(nl1e, _PAGE_GLOBAL);
-
-    if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
-        return 0;
-
-    res = get_page_from_l1e(nl1e, d);
-
-    if ( unlikely(!res) && IS_PRIV(d) && !shadow_mode_translate(d) &&
-         !(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) &&
-         (mfn = l1e_get_pfn(nl1e)) &&
-         mfn_valid(mfn) &&
-         (owner = page_get_owner(mfn_to_page(mfn))) &&
-         (d != owner) )
-    {
-        res = get_page_from_l1e(nl1e, owner);
-        printk("tried to map mfn %lx from domain %d into shadow page tables "
-               "of domain %d; %s\n",
-               mfn, owner->domain_id, d->domain_id,
-               res ? "success" : "failed");
-    }
-
-    if ( unlikely(!res) )
-    {
-        perfc_incrc(shadow_get_page_fail);
-        FSH_LOG("%s failed to get ref l1e=%" PRIpte "\n",
-                __func__, l1e_get_intpte(l1e));
-    }
-
-    return res;
-}
-
-static inline void
-shadow_put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return;
-
-    put_page_from_l1e(l1e, d);
-}
-
-static inline void
-shadow_put_page_type(struct domain *d, struct page_info *page)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return;
-
-    put_page_type(page);
-}
-
-static inline int shadow_get_page(struct domain *d,
-                                  struct page_info *page,
-                                  struct domain *owner)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return 1;
-    return get_page(page, owner);
-}
-
-static inline void shadow_put_page(struct domain *d,
-                                   struct page_info *page)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return;
-    put_page(page);
-}
-
-/************************************************************************/
-
-static inline void __mark_dirty(struct domain *d, unsigned long mfn)
-{
-    unsigned long pfn;
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    if ( likely(!shadow_mode_log_dirty(d)) || !VALID_MFN(mfn) )
-        return;
-
-    ASSERT(d->arch.shadow_dirty_bitmap != NULL);
-
-    /* We /really/ mean PFN here, even for non-translated guests. */
-    pfn = get_gpfn_from_mfn(mfn);
-
-    /*
-     * Values with the MSB set denote MFNs that aren't really part of the 
-     * domain's pseudo-physical memory map (e.g., the shared info frame).
-     * Nothing to do here...
-     */
-    if ( unlikely(IS_INVALID_M2P_ENTRY(pfn)) )
-        return;
-
-    /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
-    if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) &&
-         !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) )
-    {
-        d->arch.shadow_dirty_count++;
-    }
-#ifndef NDEBUG
-    else if ( mfn_valid(mfn) )
-    {
-        SH_VLOG("mark_dirty OOR! mfn=%lx pfn=%lx max=%x (dom %p)",
-               mfn, pfn, d->arch.shadow_dirty_bitmap_size, d);
-        SH_VLOG("dom=%p caf=%08x taf=%" PRtype_info, 
-                page_get_owner(mfn_to_page(mfn)),
-                mfn_to_page(mfn)->count_info, 
-                mfn_to_page(mfn)->u.inuse.type_info );
-    }
-#endif
-}
-
-
-static inline void mark_dirty(struct domain *d, unsigned int mfn)
-{
-    if ( unlikely(shadow_mode_log_dirty(d)) )
-    {
-        shadow_lock(d);
-        __mark_dirty(d, mfn);
-        shadow_unlock(d);
-    }
-}
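
Illustrative sketch with a hypothetical helper (not taken from the patch):
the bitmap that __mark_dirty() populates can equally be tested, assuming
the caller holds the shadow lock so the bitmap cannot change underneath it.

/* Hypothetical helper, for illustration only: query the log-dirty bitmap.
 * The caller is assumed to hold the shadow lock. */
static inline int example_pfn_is_dirty(struct domain *d, unsigned long pfn)
{
    return shadow_mode_log_dirty(d) &&
           (d->arch.shadow_dirty_bitmap != NULL) &&
           (pfn < d->arch.shadow_dirty_bitmap_size) &&
           test_bit(pfn, d->arch.shadow_dirty_bitmap);
}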
-
-
-/************************************************************************/
-#if CONFIG_PAGING_LEVELS <= 2
-static inline void
-__shadow_get_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e)
-{
-    ASSERT(shadow_mode_enabled(v->domain));
-
-    *psl2e = v->arch.shadow_vtable[l2_table_offset(va)];
-}
-
-static inline void
-__shadow_set_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
-    ASSERT(shadow_mode_enabled(v->domain));
-
-    v->arch.shadow_vtable[l2_table_offset(va)] = value;
-}
-
-static inline void
-__guest_get_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t *pl2e)
-{
-    *pl2e = v->arch.guest_vtable[l2_table_offset(va)];
-}
-
-static inline void
-__guest_set_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
-    struct domain *d = v->domain;
-
-    v->arch.guest_vtable[l2_table_offset(va)] = value;
-
-    if ( unlikely(shadow_mode_translate(d)) )
-        update_hl2e(v, va);
-
-    __mark_dirty(d, pagetable_get_pfn(v->arch.guest_table));
-}
-
-static inline void
-__direct_get_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e)
-{
-    l2_pgentry_t *phys_vtable;
-
-    ASSERT(shadow_mode_enabled(v->domain));
-
-    phys_vtable = map_domain_page(
-        pagetable_get_pfn(v->domain->arch.phys_table));
-
-    *psl2e = phys_vtable[l2_table_offset(va)];
-
-    unmap_domain_page(phys_vtable);
-}
-
-static inline void
-__direct_set_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
-    l2_pgentry_t *phys_vtable;
-
-    ASSERT(shadow_mode_enabled(v->domain));
-
-    phys_vtable = map_domain_page(
-        pagetable_get_pfn(v->domain->arch.phys_table));
-
-    phys_vtable[l2_table_offset(va)] = value;
-
-    unmap_domain_page(phys_vtable);
-}
-
-static inline void
-update_hl2e(struct vcpu *v, unsigned long va)
-{
-    int index = l2_table_offset(va);
-    unsigned long mfn;
-    l2_pgentry_t gl2e = v->arch.guest_vtable[index];
-    l1_pgentry_t old_hl2e, new_hl2e;
-    int need_flush = 0;
-
-    ASSERT(shadow_mode_translate(v->domain));
-
-    old_hl2e = v->arch.hl2_vtable[index];
-
-    if ( (l2e_get_flags(gl2e) & _PAGE_PRESENT) &&
-         VALID_MFN(mfn = get_mfn_from_gpfn(l2e_get_pfn(gl2e))) )
-        new_hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
-    else
-        new_hl2e = l1e_empty();
-
-    // only do the ref counting if something has changed.
-    //
-    if ( (l1e_has_changed(old_hl2e, new_hl2e, PAGE_FLAG_MASK)) )
-    {
-        if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) &&
-             !shadow_get_page(v->domain, mfn_to_page(l1e_get_pfn(new_hl2e)),
-                              v->domain) )
-            new_hl2e = l1e_empty();
-        if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT )
-        {
-            shadow_put_page(v->domain, mfn_to_page(l1e_get_pfn(old_hl2e)));
-            need_flush = 1;
-        }
-
-        v->arch.hl2_vtable[l2_table_offset(va)] = new_hl2e;
-
-        if ( need_flush )
-        {
-            perfc_incrc(update_hl2e_invlpg);
-            flush_tlb_one_mask(v->domain->domain_dirty_cpumask,
-                               &linear_pg_table[l1_linear_offset(va)]);
-        }
-    }
-}
-
-static inline void shadow_drop_references(
-    struct domain *d, struct page_info *page)
-{
-    if ( likely(!shadow_mode_refcounts(d)) ||
-         ((page->u.inuse.type_info & PGT_count_mask) == 0) )
-        return;
-
-    /* XXX This needs more thought... */
-    printk("%s: needing to call shadow_remove_all_access for mfn=%lx\n",
-           __func__, page_to_mfn(page));
-    printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
-           page->count_info, page->u.inuse.type_info);
-
-    shadow_lock(d);
-    shadow_remove_all_access(d, page_to_mfn(page));
-    shadow_unlock(d);
-
-    printk("After:  mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
-           page->count_info, page->u.inuse.type_info);
-}
-
-/* XXX Needs more thought. Neither pretty nor fast: a place holder. */
-static inline void shadow_sync_and_drop_references(
-    struct domain *d, struct page_info *page)
-{
-    if ( likely(!shadow_mode_refcounts(d)) )
-        return;
-
-    if ( page_out_of_sync(page) )
-        __shadow_sync_mfn(d, page_to_mfn(page));
-
-    shadow_remove_all_access(d, page_to_mfn(page));
-}
-#endif
-
-/************************************************************************/
-
-/*
- * Add another shadow reference to smfn.
- */
-static inline int
-get_shadow_ref(unsigned long smfn)
-{
-    u32 x, nx;
-
-    ASSERT(mfn_valid(smfn));
-
-    x = mfn_to_page(smfn)->count_info;
-    nx = x + 1;
-
-    if ( unlikely(nx == 0) )
-    {
-        printk("get_shadow_ref overflow, gmfn=%" PRtype_info  " smfn=%lx\n",
-               mfn_to_page(smfn)->u.inuse.type_info & PGT_mfn_mask,
-               smfn);
-        BUG();
-    }
-    
-    // Guarded by the shadow lock...
-    //
-    mfn_to_page(smfn)->count_info = nx;
-
-    return 1;
-}
-
-/*
- * Drop a shadow reference to smfn.
- */
-static inline void
-put_shadow_ref(unsigned long smfn)
-{
-    u32 x, nx;
-
-    ASSERT(mfn_valid(smfn));
-
-    x = mfn_to_page(smfn)->count_info;
-    nx = x - 1;
-
-    if ( unlikely(x == 0) )
-    {
-        printk("put_shadow_ref underflow, smfn=%lx oc=%08x t=%" 
-               PRtype_info "\n",
-               smfn,
-               mfn_to_page(smfn)->count_info,
-               mfn_to_page(smfn)->u.inuse.type_info);
-        BUG();
-    }
-
-    // Guarded by the shadow lock...
-    //
-    mfn_to_page(smfn)->count_info = nx;
-
-    if ( unlikely(nx == 0) )
-    {
-        free_shadow_page(smfn);
-    }
-}
-
-static inline void
-shadow_pin(unsigned long smfn)
-{
-    ASSERT( !(mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) );
-
-    mfn_to_page(smfn)->u.inuse.type_info |= PGT_pinned;
-    if ( unlikely(!get_shadow_ref(smfn)) )
-        BUG();
-}
-
-static inline void
-shadow_unpin(unsigned long smfn)
-{
-    ASSERT( (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) );
-
-    mfn_to_page(smfn)->u.inuse.type_info &= ~PGT_pinned;
-    put_shadow_ref(smfn);
-}
-
-/*
- * SMP issue. The following code assumes the shadow lock is held. Re-visit
- * when working on finer-grained locks for shadow.
- */
-static inline void set_guest_back_ptr(
-    struct domain *d, l1_pgentry_t spte,
-    unsigned long smfn, unsigned int index)
-{
-    struct page_info *gpage;
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    if ( !shadow_mode_external(d) || 
-         ((l1e_get_flags(spte) & (_PAGE_PRESENT|_PAGE_RW)) !=
-          (_PAGE_PRESENT|_PAGE_RW)) )
-        return;
-
-    gpage = l1e_get_page(spte);
-
-    ASSERT(smfn != 0);
-    ASSERT(page_to_mfn(gpage) != 0);
-
-    gpage->tlbflush_timestamp = smfn;
-    gpage->u.inuse.type_info &= ~PGT_va_mask;
-    gpage->u.inuse.type_info |= (unsigned long)index << PGT_va_shift;
-}
-
-/************************************************************************/
-#if CONFIG_PAGING_LEVELS <= 2
-extern void shadow_mark_va_out_of_sync(
-    struct vcpu *v, unsigned long gpfn, unsigned long mfn,
-    unsigned long va);
-
-static inline int l1pte_write_fault(
-    struct vcpu *v, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p,
-    unsigned long va)
-{
-    struct domain *d = v->domain;
-    l1_pgentry_t gpte = *gpte_p;
-    l1_pgentry_t spte;
-    unsigned long gpfn = l1e_get_pfn(gpte);
-    unsigned long gmfn = gmfn_to_mfn(d, gpfn);
-
-    //printk("l1pte_write_fault gmfn=%lx\n", gmfn);
-
-    if ( unlikely(!VALID_MFN(gmfn)) )
-    {
-        SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn);
-        *spte_p = l1e_empty();
-        return 0;
-    }
-
-    ASSERT(l1e_get_flags(gpte) & _PAGE_RW);
-    l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED);
-    spte = l1e_from_pfn(gmfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
-    SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
-             l1e_get_intpte(spte), l1e_get_intpte(gpte));
-
-    __mark_dirty(d, gmfn);
-
-    if ( mfn_is_page_table(gmfn) )
-        shadow_mark_va_out_of_sync(v, gpfn, gmfn, va);
-
-    *gpte_p = gpte;
-    *spte_p = spte;
-
-    return 1;
-}
-
-static inline int l1pte_read_fault(
-    struct domain *d, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p)
-{ 
-    l1_pgentry_t gpte = *gpte_p;
-    l1_pgentry_t spte = *spte_p;
-    unsigned long pfn = l1e_get_pfn(gpte);
-    unsigned long mfn = gmfn_to_mfn(d, pfn);
-
-    if ( unlikely(!VALID_MFN(mfn)) )
-    {
-        SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn);
-        *spte_p = l1e_empty();
-        return 0;
-    }
-
-    l1e_add_flags(gpte, _PAGE_ACCESSED);
-    spte = l1e_from_pfn(mfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
-    if ( shadow_mode_log_dirty(d) || !(l1e_get_flags(gpte) & _PAGE_DIRTY) ||
-         mfn_is_page_table(mfn) )
-    {
-        l1e_remove_flags(spte, _PAGE_RW);
-    }
-
-    SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
-             l1e_get_intpte(spte), l1e_get_intpte(gpte));
-    *gpte_p = gpte;
-    *spte_p = spte;
-
-    return 1;
-}
-#endif
-
-static inline void l1pte_propagate_from_guest(
-    struct domain *d, guest_l1_pgentry_t gpte, l1_pgentry_t *spte_p)
-{ 
-    unsigned long mfn;
-    l1_pgentry_t spte;
-
-    spte = l1e_empty();
-
-    if ( ((guest_l1e_get_flags(gpte) & (_PAGE_PRESENT|_PAGE_ACCESSED) ) ==
-          (_PAGE_PRESENT|_PAGE_ACCESSED)) &&
-         VALID_MFN(mfn = gmfn_to_mfn(d, l1e_get_pfn(gpte))) )
-    {
-        spte = l1e_from_pfn(
-            mfn, guest_l1e_get_flags(gpte) & ~(_PAGE_GLOBAL | _PAGE_AVAIL));
-
-        if ( shadow_mode_log_dirty(d) ||
-             !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) ||
-             mfn_is_page_table(mfn) )
-        {
-            l1e_remove_flags(spte, _PAGE_RW);
-        }
-    }
-
-    if ( l1e_get_intpte(spte) || l1e_get_intpte(gpte) )
-        SH_VVVLOG("%s: gpte=%" PRIpte ", new spte=%" PRIpte,
-                  __func__, l1e_get_intpte(gpte), l1e_get_intpte(spte));
-
-    *spte_p = spte;
-}
-
-static inline void hl2e_propagate_from_guest(
-    struct domain *d, l2_pgentry_t gpde, l1_pgentry_t *hl2e_p)
-{
-    unsigned long pfn = l2e_get_pfn(gpde);
-    unsigned long mfn;
-    l1_pgentry_t hl2e;
-    
-    hl2e = l1e_empty();
-
-    if ( l2e_get_flags(gpde) & _PAGE_PRESENT )
-    {
-        mfn = gmfn_to_mfn(d, pfn);
-        if ( VALID_MFN(mfn) && mfn_valid(mfn) )
-            hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
-    }
-
-    if ( l1e_get_intpte(hl2e) || l2e_get_intpte(gpde) )
-        SH_VVLOG("%s: gpde=%" PRIpte " hl2e=%" PRIpte, __func__,
-                 l2e_get_intpte(gpde), l1e_get_intpte(hl2e));
-
-    *hl2e_p = hl2e;
-}
-
-static inline void l2pde_general(
-    struct domain *d,
-    guest_l2_pgentry_t *gpde_p,
-    l2_pgentry_t *spde_p,
-    unsigned long sl1mfn)
-{
-    guest_l2_pgentry_t gpde = *gpde_p;
-    l2_pgentry_t spde;
-
-    spde = l2e_empty();
-    if ( (guest_l2e_get_flags(gpde) & _PAGE_PRESENT) && (sl1mfn != 0) )
-    {
-        spde = l2e_from_pfn(
-            sl1mfn,
-            (guest_l2e_get_flags(gpde) | _PAGE_RW | _PAGE_ACCESSED) & ~_PAGE_AVAIL);
-
-        /* N.B. PDEs do not have a dirty bit. */
-        guest_l2e_add_flags(gpde, _PAGE_ACCESSED);
-
-        *gpde_p = gpde;
-    } 
-
-    if ( l2e_get_intpte(spde) || l2e_get_intpte(gpde) )
-        SH_VVLOG("%s: gpde=%" PRIpte ", new spde=%" PRIpte, __func__,
-                 l2e_get_intpte(gpde), l2e_get_intpte(spde));
-
-    *spde_p = spde;
-}
-
-static inline void l2pde_propagate_from_guest(
-    struct domain *d, guest_l2_pgentry_t *gpde_p, l2_pgentry_t *spde_p)
-{
-    guest_l2_pgentry_t gpde = *gpde_p;
-    unsigned long sl1mfn = 0;
-
-    if ( guest_l2e_get_flags(gpde) & _PAGE_PRESENT )
-        sl1mfn =  __shadow_status(d, l2e_get_pfn(gpde), PGT_l1_shadow);
-    l2pde_general(d, gpde_p, spde_p, sl1mfn);
-}
-    
-/************************************************************************/
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_pte_change(
-    struct domain *d,
-    guest_l1_pgentry_t new_pte,
-    l1_pgentry_t *shadow_pte_p)
-{
-    l1_pgentry_t old_spte, new_spte;
-    int need_flush = 0;
-
-    perfc_incrc(validate_pte_calls);
-
-    l1pte_propagate_from_guest(d, new_pte, &new_spte);
-
-    if ( shadow_mode_refcounts(d) )
-    {
-        old_spte = *shadow_pte_p;
-
-        if ( l1e_get_intpte(old_spte) == l1e_get_intpte(new_spte) )
-        {
-            // No accounting required...
-            //
-            perfc_incrc(validate_pte_changes1);
-        }
-        else if ( l1e_get_intpte(old_spte) == (l1e_get_intpte(new_spte)|_PAGE_RW) )
-        {
-            // Fast path for PTEs that have merely been write-protected
-            // (e.g., during a Unix fork()). A strict reduction in privilege.
-            //
-            perfc_incrc(validate_pte_changes2);
-            if ( likely(l1e_get_flags(new_spte) & _PAGE_PRESENT) )
-                shadow_put_page_type(d, mfn_to_page(l1e_get_pfn(new_spte)));
-        }
-        else if ( ((l1e_get_flags(old_spte) | l1e_get_flags(new_spte)) &
-                   _PAGE_PRESENT ) &&
-                  l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
-        {
-            // only do the ref counting if something important changed.
-            //
-            perfc_incrc(validate_pte_changes3);
-
-            if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
-            {
-                shadow_put_page_from_l1e(old_spte, d);
-                need_flush = 1;
-            }
-            if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
-                 !shadow_get_page_from_l1e(new_spte, d) ) {
-                new_spte = l1e_empty();
-                need_flush = -1; /* need to unshadow the page */
-            }
-        }
-        else
-        {
-            perfc_incrc(validate_pte_changes4);
-        }
-    }
-
-    *shadow_pte_p = new_spte;
-
-    return need_flush;
-}
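
Illustrative sketch with a hypothetical caller (not taken from the patch):
the return value drives TLB maintenance, and real callers also act on the
-1 case, which asks for the page to be unshadowed.

/* Hypothetical caller, for illustration only: push a guest PTE update into
 * its shadow and flush stale mappings if validate_pte_change() asks for it.
 * The -1 (unshadow) case is not handled in this sketch. */
static inline void example_shadow_pte_update(struct domain *d,
                                             guest_l1_pgentry_t new_gpte,
                                             l1_pgentry_t *spte_p)
{
    if ( validate_pte_change(d, new_gpte, spte_p) != 0 )
        flush_tlb_mask(d->domain_dirty_cpumask);
}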
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_hl2e_change(
-    struct domain *d,
-    l2_pgentry_t new_gpde,
-    l1_pgentry_t *shadow_hl2e_p)
-{
-    l1_pgentry_t old_hl2e, new_hl2e;
-    int need_flush = 0;
-
-    perfc_incrc(validate_hl2e_calls);
-
-    old_hl2e = *shadow_hl2e_p;
-    hl2e_propagate_from_guest(d, new_gpde, &new_hl2e);
-
-    // Only do the ref counting if something important changed.
-    //
-    if ( ((l1e_get_flags(old_hl2e) | l1e_get_flags(new_hl2e)) & _PAGE_PRESENT) &&
-         l1e_has_changed(old_hl2e, new_hl2e, _PAGE_PRESENT) )
-    {
-        perfc_incrc(validate_hl2e_changes);
-
-        if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) &&
-             !get_page(mfn_to_page(l1e_get_pfn(new_hl2e)), d) )
-            new_hl2e = l1e_empty();
-        if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT )
-        {
-            put_page(mfn_to_page(l1e_get_pfn(old_hl2e)));
-            need_flush = 1;
-        }
-    }
-
-    *shadow_hl2e_p = new_hl2e;
-
-    return need_flush;
-}
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_pde_change(
-    struct domain *d,
-    guest_l2_pgentry_t new_gpde,
-    l2_pgentry_t *shadow_pde_p)
-{
-    l2_pgentry_t old_spde, new_spde;
-    int need_flush = 0;
-
-    perfc_incrc(validate_pde_calls);
-
-    old_spde = *shadow_pde_p;
-    l2pde_propagate_from_guest(d, &new_gpde, &new_spde);
-
-    // Only do the ref counting if something important changed.
-    //
-    if ( ((l2e_get_intpte(old_spde) | l2e_get_intpte(new_spde)) & _PAGE_PRESENT) &&
-         l2e_has_changed(old_spde, new_spde, _PAGE_PRESENT) )
-    {
-        perfc_incrc(validate_pde_changes);
-
-        if ( (l2e_get_flags(new_spde) & _PAGE_PRESENT) &&
-             !get_shadow_ref(l2e_get_pfn(new_spde)) )
-            BUG();
-        if ( l2e_get_flags(old_spde) & _PAGE_PRESENT )
-        {
-            put_shadow_ref(l2e_get_pfn(old_spde));
-            need_flush = 1;
-        }
-    }
-
-    *shadow_pde_p = new_spde;
-
-    return need_flush;
-}
-
-/*********************************************************************/
-
-#if SHADOW_HASH_DEBUG
-
-static void shadow_audit(struct domain *d, int print)
-{
-    int live = 0, free = 0, j = 0, abs;
-    struct shadow_status *a;
-
-    for ( j = 0; j < shadow_ht_buckets; j++ )
-    {
-        a = &d->arch.shadow_ht[j];        
-        if ( a->gpfn_and_flags )
-        {
-            live++;
-            ASSERT(a->smfn);
-        }
-        else
-            ASSERT(!a->next);
-
-        a = a->next;
-        while ( a && (live < 9999) )
-        { 
-            live++;
-            if ( (a->gpfn_and_flags == 0) || (a->smfn == 0) )
-            {
-                printk("XXX live=%d gpfn+flags=%lx sp=%lx next=%p\n",
-                       live, a->gpfn_and_flags, a->smfn, a->next);
-                BUG();
-            }
-            ASSERT(a->smfn);
-            a = a->next;
-        }
-        ASSERT(live < 9999);
-    }
-
-    for ( a = d->arch.shadow_ht_free; a != NULL; a = a->next )
-        free++;
-
-    if ( print )
-        printk("Xlive=%d free=%d\n", live, free);
-
-    // BUG: this only works if there's only a single domain which is
-    //      using shadow tables.
-    //
-    abs = (
-        perfc_value(shadow_l1_pages) +
-        perfc_value(shadow_l2_pages) +
-        perfc_value(hl2_table_pages) +
-        perfc_value(snapshot_pages) +
-        perfc_value(writable_pte_predictions)
-        ) - live;
-#ifdef PERF_COUNTERS
-    if ( (abs < -1) || (abs > 1) )
-    {
-        printk("live=%d free=%d l1=%d l2=%d hl2=%d snapshot=%d 
writable_ptes=%d\n",
-               live, free,
-               perfc_value(shadow_l1_pages),
-               perfc_value(shadow_l2_pages),
-               perfc_value(hl2_table_pages),
-               perfc_value(snapshot_pages),
-               perfc_value(writable_pte_predictions));
-        BUG();
-    }
-#endif
-
-    // XXX ought to add some code to audit the out-of-sync entries, too.
-    //
-}
-#else
-#define shadow_audit(p, print) ((void)0)
-#endif
-
-
-static inline struct shadow_status *hash_bucket(
-    struct domain *d, unsigned int gpfn)
-{
-    return &d->arch.shadow_ht[gpfn % shadow_ht_buckets];
-}
-
-
-/*
- * N.B. This takes a guest pfn (i.e. a pfn in the guest's namespace,
- *      which, depending on full shadow mode, may or may not equal
- *      its mfn).
- *      It returns the shadow's mfn, or zero if it doesn't exist.
- */
-static inline unsigned long __shadow_status(
-    struct domain *d, unsigned long gpfn, unsigned long stype)
-{
-    struct shadow_status *p, *x, *head;
-    shadow_key_t key;
-#if CONFIG_PAGING_LEVELS >= 3
-    if ( d->arch.ops->guest_paging_levels == PAGING_L3 && stype == PGT_l4_shadow )
-        key = gpfn | stype | index_to_key(get_cr3_idxval(current));
-    else
-#endif
-        key = gpfn | stype;
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(gpfn == (gpfn & PGT_mfn_mask));
-    ASSERT(stype && !(stype & ~PGT_type_mask));
-
-    perfc_incrc(shadow_status_calls);
-
-    x = head = hash_bucket(d, gpfn);
-    p = NULL;
-
-    shadow_audit(d, 0);
-
-    do
-    {
-        ASSERT(x->gpfn_and_flags || ((x == head) && (x->next == NULL)));
-
-        if ( x->gpfn_and_flags == key )
-        {
-#if SHADOW_DEBUG
-            if ( unlikely(shadow_status_noswap) )
-                return x->smfn;
-#endif
-            /* Pull-to-front if 'x' isn't already the head item. */
-            if ( unlikely(x != head) )
-            {
-                /* Delete 'x' from list and reinsert immediately after head. */
-                p->next = x->next;
-                x->next = head->next;
-                head->next = x;
-
-                /* Swap 'x' contents with head contents. */
-                SWAP(head->gpfn_and_flags, x->gpfn_and_flags);
-                SWAP(head->smfn, x->smfn);
-            }
-            else
-            {
-                perfc_incrc(shadow_status_hit_head);
-            }
-
-            return head->smfn;
-        }
-
-        p = x;
-        x = x->next;
-    }
-    while ( x != NULL );
-
-    perfc_incrc(shadow_status_miss);
-    return 0;
-}
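
The swap-with-head step above is presumably needed because hash_bucket()
returns a pointer to an element embedded in the shadow_ht[] array, so the
head node itself cannot be relinked.  A stand-alone sketch of the same
pull-to-front idea, with hypothetical types (not taken from the patch):

/* Hypothetical bucket node, for illustration only. */
struct example_node { struct example_node *next; unsigned long key, val; };

/* Find 'key' in a singly-linked bucket; on a hit that is not already at
 * the head, unlink the node, reinsert it just after the head and swap
 * payloads, so the hottest entry is found on the first probe next time. */
static unsigned long example_lookup(struct example_node *head,
                                    unsigned long key)
{
    struct example_node *p = NULL, *x;

    for ( x = head; x != NULL; p = x, x = x->next )
    {
        if ( x->key != key )
            continue;
        if ( x != head )
        {
            unsigned long tk, tv;
            p->next = x->next;       /* unlink the hit node          */
            x->next = head->next;    /* reinsert just after the head */
            head->next = x;
            tk = head->key; head->key = x->key; x->key = tk;
            tv = head->val; head->val = x->val; x->val = tv;
        }
        return head->val;
    }
    return 0;                        /* miss */
}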
-
-/*
- * Not clear if pull-to-front is worth while for this or not,
- * as it generally needs to scan the entire bucket anyway.
- * Much simpler without.
- *
- * Either returns PGT_none, or PGT_l{1,2,3,4}_page_table.
- */
-static inline u32
-shadow_max_pgtable_type(struct domain *d, unsigned long gpfn,
-                        unsigned long *smfn)
-{
-    struct shadow_status *x;
-    u32 pttype = PGT_none, type;
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(gpfn == (gpfn & PGT_mfn_mask));
-
-    perfc_incrc(shadow_max_type);
-
-    x = hash_bucket(d, gpfn);
-
-    while ( x && x->gpfn_and_flags )
-    {
-        if ( (x->gpfn_and_flags & PGT_mfn_mask) == gpfn )
-        {
-            type = x->gpfn_and_flags & PGT_type_mask;
-
-            switch ( type )
-            {
-            case PGT_hl2_shadow:
-                // Treat an HL2 as if it's an L1
-                //
-                type = PGT_l1_shadow;
-                break;
-            case PGT_snapshot:
-            case PGT_writable_pred:
-                // Ignore snapshots -- they don't in and of themselves constitute
-                // treating a page as a page table
-                //
-                goto next;
-            case PGT_base_page_table:
-                // Early exit if we found the max possible value
-                //
-                return type;
-            default:
-                break;
-            }
-
-            if ( type > pttype )
-            {
-                pttype = type;
-                if ( smfn )
-                    *smfn = x->smfn;
-            }
-        }
-    next:
-        x = x->next;
-    }
-
-    return pttype;
-}
-
-static inline void delete_shadow_status(
-    struct domain *d, unsigned long gpfn, unsigned long gmfn, unsigned int stype, u64 index)
-{
-    struct shadow_status *p, *x, *n, *head;
-
-    shadow_key_t key = gpfn | stype | index_to_key(index);
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(!(gpfn & ~PGT_mfn_mask));
-    ASSERT(stype && !(stype & ~PGT_type_mask));
-
-    head = hash_bucket(d, gpfn);
-
-    SH_VLOG("delete gpfn=%lx t=%08x bucket=%p", gpfn, stype, head);
-    shadow_audit(d, 0);
-
-    /* Match on head item? */
-    if ( head->gpfn_and_flags == key )
-    {
-        if ( (n = head->next) != NULL )
-        {
-            /* Overwrite head with contents of following node. */
-            head->gpfn_and_flags = n->gpfn_and_flags;
-            head->smfn           = n->smfn;
-
-            /* Delete following node. */
-            head->next           = n->next;
-
-            /* Add deleted node to the free list. */
-            n->gpfn_and_flags = 0;
-            n->smfn           = 0;
-            n->next           = d->arch.shadow_ht_free;
-            d->arch.shadow_ht_free = n;
-        }
-        else
-        {
-            /* This bucket is now empty. Initialise the head node. */
-            head->gpfn_and_flags = 0;
-            head->smfn           = 0;
-        }
-
-        goto found;
-    }
-
-    p = head;
-    x = head->next;
-
-    do
-    {
-        if ( x->gpfn_and_flags == key )
-        {
-            /* Delete matching node. */
-            p->next = x->next;
-
-            /* Add deleted node to the free list. */
-            x->gpfn_and_flags = 0;
-            x->smfn           = 0;
-            x->next           = d->arch.shadow_ht_free;
-            d->arch.shadow_ht_free = x;
-
-            goto found;
-        }
-
-        p = x;
-        x = x->next;
-    }
-    while ( x != NULL );
-
-    /* If we got here, it wasn't in the list! */
-    BUG();
-
- found:
-    // release ref to page
-    if ( stype != PGT_writable_pred )
-        put_page(mfn_to_page(gmfn));
-
-    shadow_audit(d, 0);
-}
-
-static inline void set_shadow_status(
-    struct domain *d, unsigned long gpfn, unsigned long gmfn,
-    unsigned long smfn, unsigned long stype, u64 index)
-{
-    struct shadow_status *x, *head, *extra;
-    int i;
-
-    shadow_key_t key = gpfn | stype | index_to_key(index);
-
-    SH_VVLOG("set gpfn=%lx gmfn=%lx smfn=%lx t=%lx", gpfn, gmfn, smfn, stype);
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    ASSERT(shadow_mode_translate(d) || gpfn);
-    ASSERT(!(gpfn & ~PGT_mfn_mask));
-
-    // XXX - need to be more graceful.
-    ASSERT(VALID_MFN(gmfn));
-
-    ASSERT(stype && !(stype & ~PGT_type_mask));
-
-    x = head = hash_bucket(d, gpfn);
-
-    SH_VLOG("set gpfn=%lx smfn=%lx t=%lx bucket=%p(%p)",
-             gpfn, smfn, stype, x, x->next);
-    shadow_audit(d, 0);
-
-    // grab a reference to the guest page to represent the entry in the shadow
-    // hash table
-    //
-    // XXX - Should PGT_writable_pred grab a page ref?
-    //     - Who/how are these hash table entry refs flushed if/when a page
-    //       is given away by the domain?
-    //
-    if ( stype != PGT_writable_pred )
-        get_page(mfn_to_page(gmfn), d);
-
-    /*
-     * STEP 1. If page is already in the table, update it in place.
-     */
-    do
-    {
-        if ( unlikely(x->gpfn_and_flags == key) )
-        {
-            if ( stype != PGT_writable_pred )
-                BUG(); // we should never replace entries into the hash table
-            x->smfn = smfn;
-            if ( stype != PGT_writable_pred )
-                put_page(mfn_to_page(gmfn)); // already had a ref...
-            goto done;
-        }
-
-        x = x->next;
-    }
-    while ( x != NULL );
-
-    /*
-     * STEP 2. The page must be inserted into the table.
-     */
-
-    /* If the bucket is empty then insert the new page as the head item. */
-    if ( head->gpfn_and_flags == 0 )
-    {
-        head->gpfn_and_flags = key;
-        head->smfn           = smfn;
-        ASSERT(head->next == NULL);
-        goto done;
-    }
-
-    /* We need to allocate a new node. Ensure the quicklist is non-empty. */
-    if ( unlikely(d->arch.shadow_ht_free == NULL) )
-    {
-        SH_VLOG("Allocate more shadow hashtable blocks.");
-
-        extra = xmalloc_bytes(
-            sizeof(void *) + (shadow_ht_extra_size * sizeof(*x)));
-
-        /* XXX Should be more graceful here. */
-        if ( extra == NULL )
-            BUG();
-
-        memset(extra, 0, sizeof(void *) + (shadow_ht_extra_size * sizeof(*x)));
-
-        /* Record the allocation block so it can be correctly freed later. */
-        d->arch.shadow_extras_count++;
-        *((struct shadow_status **)&extra[shadow_ht_extra_size]) = 
-            d->arch.shadow_ht_extras;
-        d->arch.shadow_ht_extras = &extra[0];
-
-        /* Thread a free chain through the newly-allocated nodes. */
-        for ( i = 0; i < (shadow_ht_extra_size - 1); i++ )
-            extra[i].next = &extra[i+1];
-        extra[i].next = NULL;
-
-        /* Add the new nodes to the free list. */
-        d->arch.shadow_ht_free = &extra[0];
-    }
-
-    /* Allocate a new node from the quicklist. */
-    x                      = d->arch.shadow_ht_free;
-    d->arch.shadow_ht_free = x->next;
-
-    /* Initialise the new node and insert directly after the head item. */
-    x->gpfn_and_flags = key;
-    x->smfn           = smfn;
-    x->next           = head->next;
-    head->next        = x;
-
- done:
-    shadow_audit(d, 0);
-
-    if ( stype <= PGT_l4_shadow )
-    {
-        // add to front of list of pages to check when removing write
-        // permissions for a page...
-        //
-    }
-}
-
-/************************************************************************/
-
-static inline void guest_physmap_add_page(
-    struct domain *d, unsigned long gpfn, unsigned long mfn)
-{
-    struct domain_mmap_cache c1, c2;
-
-    if ( likely(!shadow_mode_translate(d)) )
-        return;
-
-    domain_mmap_cache_init(&c1);
-    domain_mmap_cache_init(&c2);
-    shadow_lock(d);
-    shadow_sync_and_drop_references(d, mfn_to_page(mfn));
-    set_p2m_entry(d, gpfn, mfn, &c1, &c2);
-    set_gpfn_from_mfn(mfn, gpfn);
-    shadow_unlock(d);
-    domain_mmap_cache_destroy(&c1);
-    domain_mmap_cache_destroy(&c2);
-}
-
-static inline void guest_physmap_remove_page(
-    struct domain *d, unsigned long gpfn, unsigned long mfn)
-{
-    struct domain_mmap_cache c1, c2;
-    unsigned long type;
-
-    if ( likely(!shadow_mode_translate(d)) )
-        return;
-
-    domain_mmap_cache_init(&c1);
-    domain_mmap_cache_init(&c2);
-    shadow_lock(d);
-    shadow_sync_and_drop_references(d, mfn_to_page(mfn));
-    while ( (type = shadow_max_pgtable_type(d, gpfn, NULL)) != PGT_none )
-        free_shadow_page(__shadow_status(d, gpfn, type));
-    set_p2m_entry(d, gpfn, -1, &c1, &c2);
-    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
-    shadow_unlock(d);
-    domain_mmap_cache_destroy(&c1);
-    domain_mmap_cache_destroy(&c2);
-}
-
-/************************************************************************/
-
-void static inline
-shadow_update_min_max(unsigned long smfn, int index)
-{
-    struct page_info *sl1page = mfn_to_page(smfn);
-    u32 min_max = sl1page->tlbflush_timestamp;
-    int min = SHADOW_MIN(min_max);
-    int max = SHADOW_MAX(min_max);
-    int update = 0;
-
-    if ( index < min )
-    {
-        min = index;
-        update = 1;
-    }
-    if ( index > max )
-    {
-        max = index;
-        update = 1;
-    }
-    if ( update )
-        sl1page->tlbflush_timestamp = SHADOW_ENCODE_MIN_MAX(min, max);
-}
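
A worked example of this encoding, with illustrative values and
GUEST_L1_PAGETABLE_ENTRIES taken as 1024 (a 2-level guest):

/*
 * encoded = SHADOW_ENCODE_MIN_MAX(3, 7)
 *         = ((1023 - 7) << 16) | 3              = 0x03f80003
 * SHADOW_MIN(encoded) = 0x03f80003 & 0xffff     = 3
 * SHADOW_MAX(encoded) = 1023 - (0x03f80003 >> 16) = 1023 - 1016 = 7
 *
 * A freshly zeroed tlbflush_timestamp therefore decodes to the
 * conservative full range [0, 1023].
 */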
-
-#if CONFIG_PAGING_LEVELS <= 2
-extern void shadow_map_l1_into_current_l2(unsigned long va);
-
-void static inline
-shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
-{
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    l2_pgentry_t sl2e = {0};
-
-    __shadow_get_l2e(v, va, &sl2e);
-    if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
-    {
-        /*
-         * Either the L1 is not shadowed, or the shadow isn't linked into
-         * the current shadow L2.
-         */
-        if ( create_l1_shadow )
-        {
-            perfc_incrc(shadow_set_l1e_force_map);
-            shadow_map_l1_into_current_l2(va);
-        }
-        else /* check to see if it exists; if so, link it in */
-        {
-            l2_pgentry_t gpde = linear_l2_table(v)[l2_table_offset(va)];
-            unsigned long gl1pfn = l2e_get_pfn(gpde);
-            unsigned long sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
-
-            ASSERT( l2e_get_flags(gpde) & _PAGE_PRESENT );
-
-            if ( sl1mfn )
-            {
-                perfc_incrc(shadow_set_l1e_unlinked);
-                if ( !get_shadow_ref(sl1mfn) )
-                    BUG();
-                l2pde_general(d, &gpde, &sl2e, sl1mfn);
-                __guest_set_l2e(v, va, gpde);
-                __shadow_set_l2e(v, va, sl2e);
-            }
-            else
-            {
-                // no shadow exists, so there's nothing to do.
-                perfc_incrc(shadow_set_l1e_fail);
-                return;
-            }
-        }
-    }
-
-    __shadow_get_l2e(v, va, &sl2e);
-
-    if ( shadow_mode_refcounts(d) )
-    {
-        l1_pgentry_t old_spte = shadow_linear_pg_table[l1_linear_offset(va)];
-
-        // only do the ref counting if something important changed.
-        //
-        if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
-        {
-            if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
-                 !shadow_get_page_from_l1e(new_spte, d) )
-                new_spte = l1e_empty();
-            if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
-                shadow_put_page_from_l1e(old_spte, d);
-        }
-
-    }
-
-    set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va));
-    shadow_linear_pg_table[l1_linear_offset(va)] = new_spte;
-    shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va));
-}
-#endif
-/************************************************************************/
-
-static inline int
-shadow_mode_page_writable(unsigned long va, struct cpu_user_regs *regs, unsigned long gpfn)
-{
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    unsigned long mfn = gmfn_to_mfn(d, gpfn);
-    u32 type = mfn_to_page(mfn)->u.inuse.type_info & PGT_type_mask;
-
-    if ( shadow_mode_refcounts(d) &&
-         (type == PGT_writable_page) )
-        type = shadow_max_pgtable_type(d, gpfn, NULL);
-
-    // Strange but true: writable page tables allow kernel-mode access
-    // to L1 page table pages via write-protected PTEs...  Similarly, write 
-    // access to all page table pages is granted for shadow_mode_write_all
-    // clients.
-    //
-    if ( ((shadow_mode_write_l1(d) && (type == PGT_l1_page_table)) ||
-          (shadow_mode_write_all(d) && type && (type <= PGT_l4_page_table))) &&
-         ((va < HYPERVISOR_VIRT_START)
-#if defined(__x86_64__)
-          || (va >= HYPERVISOR_VIRT_END)
-#endif
-             ) &&
-         guest_kernel_mode(v, regs) )
-        return 1;
-
-    return 0;
-}
-
-#if CONFIG_PAGING_LEVELS <= 2
-static inline l1_pgentry_t gva_to_gpte(unsigned long gva)
-{
-    l2_pgentry_t gpde;
-    l1_pgentry_t gpte;
-    struct vcpu *v = current;
-
-    ASSERT( shadow_mode_translate(current->domain) );
-
-    __guest_get_l2e(v, gva, &gpde);
-    if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
-        return l1e_empty();
-
-    // This is actually overkill - we only need to make sure the hl2
-    // is in-sync.
-    //
-    shadow_sync_va(v, gva);
-
-    if ( unlikely(__copy_from_user(&gpte,
-                                   &linear_pg_table[gva >> PAGE_SHIFT],
-                                   sizeof(gpte))) )
-    {
-        FSH_LOG("gva_to_gpte got a fault on gva=%lx", gva);
-        return l1e_empty();
-    }
-
-    return gpte;
-}
-
-static inline unsigned long gva_to_gpa(unsigned long gva)
-{
-    l1_pgentry_t gpte;
-
-    gpte = gva_to_gpte(gva);
-    if ( !(l1e_get_flags(gpte) & _PAGE_PRESENT) )
-        return 0;
-
-    return l1e_get_paddr(gpte) + (gva & ~PAGE_MASK);
-}
-#endif
-
-static inline unsigned long gva_to_mfn(unsigned long gva)
-{
-    unsigned long gpa = gva_to_gpa(gva);
-    return get_mfn_from_gpfn(gpa >> PAGE_SHIFT);
-}
-
-/************************************************************************/
-
-extern void __update_pagetables(struct vcpu *v);
-static inline void update_pagetables(struct vcpu *v)
-{
-    struct domain *d = v->domain;
-    int paging_enabled;
-
-    if ( hvm_guest(v) )
-        paging_enabled = hvm_paging_enabled(v);
-    else
-        // HACK ALERT: there's currently no easy way to figure out if a domU
-        // has set its arch.guest_table to zero, vs not yet initialized it.
-        //
-        paging_enabled = !!pagetable_get_paddr(v->arch.guest_table);
-
-    /*
-     * We don't call __update_pagetables() when hvm guest paging is
-     * disabled as we want the linear_pg_table to be inaccessible so that
-     * we bail out early of shadow_fault() if the hvm guest tries illegal
-     * accesses while it thinks paging is turned off.
-     */
-    if ( unlikely(shadow_mode_enabled(d)) && paging_enabled )
-    {
-        shadow_lock(d);
-        __update_pagetables(v);
-        shadow_unlock(d);
-    }
-
-    if ( likely(!shadow_mode_external(d)) )
-    {
-        if ( shadow_mode_enabled(d) )
-            v->arch.monitor_table = v->arch.shadow_table;
-        else
-#if CONFIG_PAGING_LEVELS == 4
-        if ( !(v->arch.flags & TF_kernel_mode) )
-            v->arch.monitor_table = v->arch.guest_table_user;
-        else
-#endif
-            v->arch.monitor_table = v->arch.guest_table;
-    }
-}
-
-void clear_all_shadow_status(struct domain *d);
-
-#if SHADOW_DEBUG
-extern int _check_pagetable(struct vcpu *v, char *s);
-extern int _check_all_pagetables(struct vcpu *v, char *s);
-
-#define check_pagetable(_v, _s) _check_pagetable(_v, _s)
-//#define check_pagetable(_v, _s) _check_all_pagetables(_v, _s)
-
-#else
-#define check_pagetable(_v, _s) ((void)0)
-#endif
-
-#endif /* XEN_SHADOW_H */
+#endif /* _XEN_SHADOW_H */
 
 /*
  * Local variables:
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/x86_32/page-2level.h
--- a/xen/include/asm-x86/x86_32/page-2level.h  Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/x86_32/page-2level.h  Wed Aug 16 17:02:35 2006 +0100
@@ -46,6 +46,7 @@ typedef l2_pgentry_t root_pgentry_t;
  *  12-bit flags = (pte[11:0])
  */
 
+#define _PAGE_NX_BIT            0U
 #define _PAGE_NX                0U
 
 /* Extract flags into 12-bit integer, or turn 12-bit flags into a pte mask. */
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/x86_32/page-3level.h
--- a/xen/include/asm-x86/x86_32/page-3level.h  Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/x86_32/page-3level.h  Wed Aug 16 17:02:35 2006 +0100
@@ -59,7 +59,8 @@ typedef l3_pgentry_t root_pgentry_t;
  *  32-bit flags = (pte[63:44],pte[11:0])
  */
 
-#define _PAGE_NX (cpu_has_nx ? (1<<31) : 0)
+#define _PAGE_NX_BIT (1U<<31)
+#define _PAGE_NX     (cpu_has_nx ? _PAGE_NX_BIT : 0)
 
 /* Extract flags into 32-bit integer, or turn 32-bit flags into a pte mask. */
 #define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF))
diff -r fda70200da01 -r 0f917d63e960 xen/include/asm-x86/x86_64/page.h
--- a/xen/include/asm-x86/x86_64/page.h Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/asm-x86/x86_64/page.h Wed Aug 16 17:02:35 2006 +0100
@@ -44,6 +44,8 @@ typedef l4_pgentry_t root_pgentry_t;
 /* Given a virtual address, get an entry offset into a linear page table. */
 #define l1_linear_offset(_a) (((_a) & VADDR_MASK) >> L1_PAGETABLE_SHIFT)
 #define l2_linear_offset(_a) (((_a) & VADDR_MASK) >> L2_PAGETABLE_SHIFT)
+#define l3_linear_offset(_a) (((_a) & VADDR_MASK) >> L3_PAGETABLE_SHIFT)
+#define l4_linear_offset(_a) (((_a) & VADDR_MASK) >> L4_PAGETABLE_SHIFT)
 
 #define is_guest_l1_slot(_s) (1)
 #define is_guest_l2_slot(_t, _s) (1)
@@ -70,7 +72,8 @@ typedef l4_pgentry_t root_pgentry_t;
 #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 40) | ((x) & 0xFFF))
 
 /* Bit 23 of a 24-bit flag mask. This corresponds to bit 63 of a pte.*/
-#define _PAGE_NX (cpu_has_nx ? (1U<<23) : 0U)
+#define _PAGE_NX_BIT (1U<<23)
+#define _PAGE_NX     (cpu_has_nx ? _PAGE_NX_BIT : 0U)
 
 #define L1_DISALLOW_MASK BASE_DISALLOW_MASK
 #define L2_DISALLOW_MASK BASE_DISALLOW_MASK
diff -r fda70200da01 -r 0f917d63e960 xen/include/public/dom0_ops.h
--- a/xen/include/public/dom0_ops.h     Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/public/dom0_ops.h     Wed Aug 16 17:02:35 2006 +0100
@@ -262,6 +262,18 @@ DEFINE_XEN_GUEST_HANDLE(dom0_sched_id_t)
 #define DOM0_SHADOW_CONTROL_OP_CLEAN       11
 #define DOM0_SHADOW_CONTROL_OP_PEEK        12
 
+/* Shadow2 operations */
+#define DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION   30
+#define DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION   31
+#define DOM0_SHADOW2_CONTROL_OP_ENABLE           32
+
+/* Mode flags for Shadow2 enable op */
+#define DOM0_SHADOW2_CONTROL_FLAG_ENABLE    (1 << 0)
+#define DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT  (1 << 1)
+#define DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY (1 << 2)
+#define DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE (1 << 3)
+#define DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL  (1 << 4)
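
Illustrative only, using a hypothetical constant that is not defined by this
patch: a caller wanting full HVM-style shadowing would presumably combine
several of these flags, though the combinations actually accepted are decided
by the shadow2 code rather than by this header.

/* Hypothetical example combination, for illustration only. */
#define EXAMPLE_SHADOW2_HVM_MODE                  \
    (DOM0_SHADOW2_CONTROL_FLAG_ENABLE    |        \
     DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT  |        \
     DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE |        \
     DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL)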
+
 struct dom0_shadow_control_stats {
     uint32_t fault_count;
     uint32_t dirty_count;
@@ -277,7 +289,9 @@ struct dom0_shadow_control {
     uint32_t       op;
     XEN_GUEST_HANDLE(ulong) dirty_bitmap;
     /* IN/OUT variables. */
-    uint64_t       pages;        /* size of buffer, updated with actual size */
+    uint64_t       pages;    /* size of buffer, updated with actual size */
+    uint32_t       mb;       /* Shadow2 memory allocation in MB */
+    uint32_t       mode;     /* Shadow2 mode to enable */
     /* OUT variables. */
     struct dom0_shadow_control_stats stats;
 };
diff -r fda70200da01 -r 0f917d63e960 xen/include/xen/domain_page.h
--- a/xen/include/xen/domain_page.h     Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/xen/domain_page.h     Wed Aug 16 17:02:35 2006 +0100
@@ -25,6 +25,13 @@ extern void *map_domain_page(unsigned lo
  * currently-executing VCPU via a call to map_domain_pages().
  */
 extern void unmap_domain_page(void *va);
+
+/* 
+ * Convert a VA (within a page previously mapped in the context of the
+ * currently-executing VCPU via a call to map_domain_pages()) to a machine 
+ * address 
+ */
+extern paddr_t mapped_domain_page_to_maddr(void *va);
 
 /*
  * Similar to the above calls, except the mapping is accessible in all
@@ -98,6 +105,7 @@ domain_mmap_cache_destroy(struct domain_
 
 #define map_domain_page(pfn)                maddr_to_virt((pfn)<<PAGE_SHIFT)
 #define unmap_domain_page(va)               ((void)(va))
+#define mapped_domain_page_to_maddr(va)     (virt_to_maddr(va))
 
 #define map_domain_page_global(pfn)         maddr_to_virt((pfn)<<PAGE_SHIFT)
 #define unmap_domain_page_global(va)        ((void)(va))
@@ -112,4 +120,9 @@ struct domain_mmap_cache {
 
 #endif /* !CONFIG_DOMAIN_PAGE */
 
+#define HERE_I_AM \
+do { \
+    printk("HERE I AM: %s %s %d\n", __func__, __FILE__, __LINE__); \
+} while (0)
+
 #endif /* __XEN_DOMAIN_PAGE_H__ */
diff -r fda70200da01 -r 0f917d63e960 xen/include/xen/lib.h
--- a/xen/include/xen/lib.h     Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/xen/lib.h     Wed Aug 16 17:02:35 2006 +0100
@@ -18,7 +18,7 @@ extern void __bug(char *file, int line) 
 #ifndef NDEBUG
 #define ASSERT(_p)                                                      \
     do {                                                                \
-        if ( !(_p) )                                                    \
+        if ( unlikely(!(_p)) )                                          \
         {                                                               \
             printk("Assertion '%s' failed, line %d, file %s\n", #_p ,   \
                    __LINE__, __FILE__);                                 \
@@ -41,7 +41,7 @@ void cmdline_parse(char *cmdline);
 void cmdline_parse(char *cmdline);
 
 #ifndef NDEBUG
-extern int debugtrace_send_to_console;
+extern void debugtrace_toggle(void);
 extern void debugtrace_dump(void);
 extern void debugtrace_printk(const char *fmt, ...);
 #else
diff -r fda70200da01 -r 0f917d63e960 xen/include/xen/list.h
--- a/xen/include/xen/list.h    Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/xen/list.h    Wed Aug 16 17:02:35 2006 +0100
@@ -160,6 +160,16 @@ static __inline__ void list_splice(struc
 #define list_for_each_safe(pos, n, head) \
        for (pos = (head)->next, n = pos->next; pos != (head); \
                pos = n, n = pos->next)
+
+/**
+ * list_for_each_backwards_safe        -       iterate backwards over a list safe against removal of list entry
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @n:         another &struct list_head to use as temporary storage
+ * @head:      the head for your list.
+ */
+#define list_for_each_backwards_safe(pos, n, head) \
+       for (pos = (head)->prev, n = pos->prev; pos != (head); \
+               pos = n, n = pos->prev)
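
Illustrative use only, with a hypothetical item type (not taken from the
patch): the _safe form keeps the extra cursor so entries may be deleted and
freed while walking tail-to-head.

/* Hypothetical item type, for illustration only. */
struct example_item {
    struct list_head list;
    int keep;
};

/* Hypothetical helper: prune unwanted items, iterating backwards. */
static inline void example_prune_backwards(struct list_head *head)
{
    struct list_head *pos, *n;

    list_for_each_backwards_safe(pos, n, head)
    {
        struct example_item *item =
            list_entry(pos, struct example_item, list);
        if ( !item->keep )
        {
            list_del(pos);
            xfree(item);
        }
    }
}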
 
 /**
  * list_for_each_entry -       iterate over list of given type
diff -r fda70200da01 -r 0f917d63e960 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Wed Aug 16 16:16:32 2006 +0100
+++ b/xen/include/xen/sched.h   Wed Aug 16 17:02:35 2006 +0100
@@ -376,9 +376,12 @@ extern struct domain *domain_list;
  /* VCPU is paused by the hypervisor? */
 #define _VCPUF_paused          11
 #define VCPUF_paused           (1UL<<_VCPUF_paused)
- /* VCPU is blocked awaiting an event to be consumed by Xen. */
+/* VCPU is blocked awaiting an event to be consumed by Xen. */
 #define _VCPUF_blocked_in_xen  12
 #define VCPUF_blocked_in_xen   (1UL<<_VCPUF_blocked_in_xen)
+ /* HVM vcpu thinks CR0.PG == 0 */
+#define _VCPUF_shadow2_translate 13
+#define VCPUF_shadow2_translate  (1UL<<_VCPUF_shadow2_translate)
 
 /*
  * Per-domain flags (domain_flags).
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/shadow2-common.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/shadow2-common.c     Wed Aug 16 17:02:35 2006 +0100
@@ -0,0 +1,3394 @@
+/******************************************************************************
+ * arch/x86/shadow2-common.c
+ *
+ * Shadow2 code that does not need to be multiply compiled.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#define SHADOW2 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/irq.h>
+#include <xen/domain_page.h>
+#include <xen/guest_access.h>
+#include <asm/event.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/flushtlb.h>
+#include <asm/shadow2.h>
+#include <asm/shadow2-private.h>
+
+#if SHADOW2_AUDIT
+int shadow2_audit_enable = 0;
+#endif
+
+static void sh2_free_log_dirty_bitmap(struct domain *d);
+
+int _shadow2_mode_refcounts(struct domain *d)
+{
+    return shadow2_mode_refcounts(d);
+}
+
+
+/**************************************************************************/
+/* x86 emulator support for the shadow2 code
+ */
+
+static int
+sh2_x86_emulate_read_std(unsigned long addr,
+                         unsigned long *val,
+                         unsigned int bytes,
+                         struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+    if ( hvm_guest(v) )
+    {
+        *val = 0;
+        // XXX -- this is WRONG.
+        //        It entirely ignores the permissions in the page tables.
+        //        In this case, that is only a user vs supervisor access check.
+        //
+        if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
+        {
+#if 0
+            SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+                           v->domain->domain_id, v->vcpu_id, 
+                           addr, *val, bytes);
+#endif
+            return X86EMUL_CONTINUE;
+        }
+
+        /* If we got here, there was nothing mapped here, or a bad GFN 
+         * was mapped here.  This should never happen: we're here because
+         * of a write fault at the end of the instruction we're emulating. */ 
+        SHADOW2_PRINTK("read failed to va %#lx\n", addr);
+        return X86EMUL_PROPAGATE_FAULT;
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+static int
+sh2_x86_emulate_write_std(unsigned long addr,
+                          unsigned long val,
+                          unsigned int bytes,
+                          struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+#if 0
+    SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+                  v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+    if ( hvm_guest(v) )
+    {
+        // XXX -- this is WRONG.
+        //        It entirely ignores the permissions in the page tables.
+        //        In this case, that includes user vs supervisor, and
+        //        write access.
+        //
+        if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
+            return X86EMUL_CONTINUE;
+
+        /* If we got here, there was nothing mapped here, or a bad GFN 
+         * was mapped here.  This should never happen: we're here because
+         * of a write fault at the end of the instruction we're emulating,
+         * which should be handled by sh2_x86_emulate_write_emulated. */ 
+        SHADOW2_PRINTK("write failed to va %#lx\n", addr);
+        return X86EMUL_PROPAGATE_FAULT;
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+static int
+sh2_x86_emulate_write_emulated(unsigned long addr,
+                               unsigned long val,
+                               unsigned int bytes,
+                               struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+#if 0
+    SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+                  v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+    if ( hvm_guest(v) )
+    {
+        return v->arch.shadow2->x86_emulate_write(v, addr, &val, bytes, ctxt);
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+static int 
+sh2_x86_emulate_cmpxchg_emulated(unsigned long addr,
+                                 unsigned long old,
+                                 unsigned long new,
+                                 unsigned int bytes,
+                                 struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+#if 0
+    SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
+                   v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
+#endif
+    if ( hvm_guest(v) )
+    {
+        return v->arch.shadow2->x86_emulate_cmpxchg(v, addr, old, new, 
+                                                    bytes, ctxt);
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+static int 
+sh2_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
+                                   unsigned long old_lo,
+                                   unsigned long old_hi,
+                                   unsigned long new_lo,
+                                   unsigned long new_hi,
+                                   struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+#if 0
+    SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
+                   v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
+                   new_hi, new_lo, ctxt);
+#endif
+    if ( hvm_guest(v) )
+    {
+        return v->arch.shadow2->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
+                                                      new_lo, new_hi, ctxt);
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+
+struct x86_emulate_ops shadow2_emulator_ops = {
+    .read_std           = sh2_x86_emulate_read_std,
+    .write_std          = sh2_x86_emulate_write_std,
+    .read_emulated      = sh2_x86_emulate_read_std,
+    .write_emulated     = sh2_x86_emulate_write_emulated,
+    .cmpxchg_emulated   = sh2_x86_emulate_cmpxchg_emulated,
+    .cmpxchg8b_emulated = sh2_x86_emulate_cmpxchg8b_emulated,
+};
+
+
+/**************************************************************************/
+/* Code for "promoting" a guest page to the point where the shadow code is
+ * willing to let it be treated as a guest page table.  This generally
+ * involves making sure there are no writable mappings available to the guest
+ * for this page.
+ */
+void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+    struct page_info *page = mfn_to_page(gmfn);
+    unsigned long type_info;
+
+    ASSERT(valid_mfn(gmfn));
+
+    /* We should never try to promote a gmfn that has writeable mappings */
+    ASSERT(shadow2_remove_write_access(v, gmfn, 0, 0) == 0);
+
+    // Is the page already shadowed?
+    if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
+    {
+        // No prior shadow exists...
+
+        // Grab a type-ref.  We don't really care if we are racing with another
+        // vcpu or not, or even what kind of type we get; we just want the type
+        // count to be > 0.
+        //
+        do {
+            type_info =
+                page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask);
+        } while ( !get_page_type(page, type_info) );
+
+        // Now that the type ref is non-zero, we can safely use the
+        // shadow2_flags.
+        //
+        page->shadow2_flags = 0;
+    }
+
+    ASSERT(!test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
+    set_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
+}
+
+void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+    struct page_info *page = mfn_to_page(gmfn);
+
+    ASSERT(test_bit(_PGC_page_table, &page->count_info));
+    ASSERT(test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
+
+    clear_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
+
+    if ( (page->shadow2_flags & SH2F_page_type_mask) == 0 )
+    {
+        // release the extra type ref
+        put_page_type(page);
+
+        // clear the is-a-page-table bit.
+        clear_bit(_PGC_page_table, &page->count_info);
+    }
+}
+
+/**************************************************************************/
+/* Validate a pagetable change from the guest and update the shadows.
+ * Returns a bitmask of SHADOW2_SET_* flags. */
+
+static int
+__shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, 
+                               void *entry, u32 size)
+{
+    int result = 0;
+    struct page_info *page = mfn_to_page(gmfn);
+
+    sh2_mark_dirty(v->domain, gmfn);
+    
+    // Determine which types of shadows are affected, and update each.
+    //
+    // Always validate L1s before L2s to prevent another cpu with a linear
+    // mapping of this gmfn from seeing a walk that results from 
+    // using the new L2 value and the old L1 value.  (It is OK for such a
+    // guest to see a walk that uses the old L2 value with the new L1 value,
+    // as hardware could behave this way if one level of the pagewalk occurs
+    // before the store, and the next level of the pagewalk occurs after the
+    // store.)
+    //
+    // Ditto for L2s before L3s, etc.
+    //
+
+    if ( !(page->count_info & PGC_page_table) )
+        return 0;  /* Not shadowed at all */
+
+#if CONFIG_PAGING_LEVELS == 2
+    if ( page->shadow2_flags & SH2F_L1_32 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 2, 2)
+            (v, gmfn, entry, size);
+#else 
+    if ( page->shadow2_flags & SH2F_L1_32 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 2)
+            (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS == 2
+    if ( page->shadow2_flags & SH2F_L2_32 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 2, 2)
+            (v, gmfn, entry, size);
+#else 
+    if ( page->shadow2_flags & SH2F_L2_32 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 2)
+            (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3 
+    if ( page->shadow2_flags & SH2F_L1_PAE ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 3)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L2_PAE ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 3)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L2H_PAE ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, 3, 3)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L3_PAE ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 3, 3)
+            (v, gmfn, entry, size);
+#else /* 32-bit non-PAE hypervisor does not support PAE guests */
+    ASSERT((page->shadow2_flags & (SH2F_L3_PAE|SH2F_L2_PAE|SH2F_L1_PAE)) == 0);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4 
+    if ( page->shadow2_flags & SH2F_L1_64 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 4, 4)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L2_64 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 4, 4)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L3_64 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 4, 4)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L4_64 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, 4, 4)
+            (v, gmfn, entry, size);
+#else /* 32-bit/PAE hypervisor does not support 64-bit guests */
+    ASSERT((page->shadow2_flags 
+            & (SH2F_L4_64|SH2F_L3_64|SH2F_L2_64|SH2F_L1_64)) == 0);
+#endif
+
+    return result;
+}
+
+
+int
+shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
+/* This is the entry point from hypercalls. It returns a bitmask of all the 
+ * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
+{
+    int rc;
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+    rc = __shadow2_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
+    shadow2_audit_tables(v);
+    return rc;
+}
+
+void
+shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
+                                void *entry, u32 size)
+/* This is the entry point for emulated writes to pagetables in HVM guests */
+{
+    struct domain *d = v->domain;
+    int rc;
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+    rc = __shadow2_validate_guest_entry(v, gmfn, entry, size);
+    if ( rc & SHADOW2_SET_FLUSH )
+    {
+        // Flush everyone except the local processor, which will flush when it
+        // re-enters the HVM guest.
+        //
+        cpumask_t mask = d->domain_dirty_cpumask;
+        cpu_clear(v->processor, mask);
+        flush_tlb_mask(mask);
+    }
+    if ( rc & SHADOW2_SET_ERROR ) 
+    {
+        /* This page is probably not a pagetable any more: tear it out of the 
+         * shadows, along with any tables that reference it */
+        shadow2_remove_all_shadows_and_parents(v, gmfn);
+    }
+    /* We ignore the other bits: since we are about to change CR3 on
+     * VMENTER we don't need to do any extra TLB flushes. */ 
+}
+
+
+/**************************************************************************/
+/* Memory management for shadow pages. */ 
+
+/* Meaning of the count_info field in shadow pages
+ * ----------------------------------------------
+ * 
+ * A count of all references to this page from other shadow pages and
+ * guest CR3s (a.k.a. v->arch.shadow_table).  
+ *
+ * The top bits hold the shadow type and the pinned bit.  Top-level
+ * shadows are pinned so that they don't disappear when not in a CR3
+ * somewhere.
+ *
+ * We don't need to use get|put_page for this as the updates are all
+ * protected by the shadow lock.  We can't use get|put_page for this
+ * as the size of the count on shadow pages is different from that on
+ * normal guest pages.
+ */
+
+/* Meaning of the type_info field in shadow pages
+ * ----------------------------------------------
+ * 
+ * type_info use depends on the shadow type (from count_info)
+ * 
+ * PGC_SH2_none : This page is in the shadow2 free pool.  type_info holds
+ *                the chunk order for our freelist allocator.
+ *
+ * PGC_SH2_l*_shadow : This page is in use as a shadow. type_info 
+ *                     holds the mfn of the guest page being shadowed,
+ *
+ * PGC_SH2_fl1_*_shadow : This page is being used to shatter a superpage.
+ *                        type_info holds the gfn being shattered.
+ *
+ * PGC_SH2_monitor_table : This page is part of a monitor table.
+ *                         type_info is not used.
+ */
+
+/* Meaning of the _domain field in shadow pages
+ * --------------------------------------------
+ *
+ * In shadow pages, this field will always have its least significant bit
+ * set.  This ensures that all attempts to get_page() will fail (as all
+ * valid pickled domain pointers have a zero for their least significant bit).
+ * Instead, the remaining upper bits are used to record the shadow generation
+ * counter when the shadow was created.
+ */
+
+/* Meaning of the shadow2_flags field
+ * ----------------------------------
+ * 
+ * In guest pages that are shadowed, one bit for each kind of shadow they have.
+ * 
+ * In shadow pages, will be used for holding a representation of the populated
+ * entries in this shadow (either a min/max, or a bitmap, or ...)
+ *
+ * In monitor-table pages, holds the level of the particular page (to save
+ * spilling the shadow types into an extra bit by having three types of monitor
+ * page).
+ */
+
+/* Meaning of the list_head struct in shadow pages
+ * -----------------------------------------------
+ *
+ * In free shadow pages, this is used to hold the free-lists of chunks.
+ *
+ * In top-level shadow tables, this holds a linked-list of all top-level
+ * shadows (used for recovering memory and destroying shadows). 
+ *
+ * In lower-level shadows, this holds the physical address of a higher-level
+ * shadow entry that holds a reference to this shadow (or zero).
+ */
+
+/* Allocating shadow pages
+ * -----------------------
+ *
+ * Most shadow pages are allocated singly, but there are two cases where we 
+ * need to allocate multiple pages together.
+ * 
+ * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
+ *    A 32-bit guest l1 table covers 4MB of virtual address space,
+ *    and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
+ *    of virtual address space each).  Similarly, a 32-bit guest l2 table 
+ *    (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va 
+ *    each).  These multi-page shadows are contiguous and aligned; 
+ *    functions for handling offsets into them are defined in shadow2.c 
+ *    (shadow_l1_index() etc.)
+ *    
+ * 2: Shadowing PAE top-level pages.  Each guest page that contains
+ *    any PAE top-level pages requires two shadow pages to shadow it.
+ *    They contain alternating l3 tables and pae_l3_bookkeeping structs.
+ *
+ * This table shows the allocation behaviour of the different modes:
+ *
+ * Xen paging      32b  pae  pae  64b  64b  64b
+ * Guest paging    32b  32b  pae  32b  pae  64b
+ * PV or HVM        *   HVM   *   HVM  HVM   * 
+ * Shadow paging   32b  pae  pae  pae  pae  64b
+ *
+ * sl1 size         4k   8k   4k   8k   4k   4k
+ * sl2 size         4k  16k   4k  16k   4k   4k
+ * sl3 size         -    -    8k   -    8k   4k
+ * sl4 size         -    -    -    -    -    4k
+ *
+ * We allocate memory from xen in four-page units and break them down
+ * with a simple buddy allocator.  Can't use the xen allocator to handle
+ * this as it only works for contiguous zones, and a domain's shadow
+ * pool is made of fragments.
+ *
+ * In HVM guests, the p2m table is built out of shadow pages, and we provide 
+ * a function for the p2m management to steal pages, in max-order chunks, from 
+ * the free pool.  We don't provide for giving them back, yet.
+ */
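+/*
+ * Worked example (for illustration only): a 32-bit guest l1 covers 4MB of VA
+ * but a PAE/64-bit sl1 covers only 2MB, so its shadow is two pages (order 1);
+ * a 32-bit guest l2 covers 4GB against 1GB per PAE/64-bit sl2, so its shadow
+ * is four pages (order 2).  These are the "1" and "2" entries for
+ * PGC_SH2_l1_32_shadow and PGC_SH2_l2_32_shadow in shadow_order() below.
+ */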
+
+/* Figure out the least acceptable quantity of shadow memory.
+ * The minimum memory requirement for always being able to free up a
+ * chunk of memory is very small -- only three max-order chunks per
+ * vcpu to hold the top level shadows and pages with Xen mappings in them.  
+ *
+ * But for a guest to be guaranteed to successfully execute a single
+ * instruction, we must be able to map a large number (about thirty) VAs
+ * at the same time, which means that to guarantee progress, we must
+ * allow for more than ninety allocated pages per vcpu.  We round that
+ * up to 128 pages, or half a megabyte per vcpu. */
+unsigned int shadow2_min_acceptable_pages(struct domain *d) 
+{
+    u32 vcpu_count = 0;
+    struct vcpu *v;
+
+    for_each_vcpu(d, v)
+        vcpu_count++;
+
+    return (vcpu_count * 128);
+}
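+/* For example, a 2-vcpu domain gets a floor of 256 pages (1MB); the pool
+ * size actually set by set_sh2_allocation() below also adds one page per
+ * MB of guest RAM for the p2m table. */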
+
+/* Using the type_info field to store freelist order */
+#define SH2_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
+#define SH2_SET_PFN_ORDER(_p, _o)                       \
+ do { (_p)->u.inuse.type_info = (_o); } while (0)
+ 
+
+/* Figure out the order of allocation needed for a given shadow type */
+static inline u32
+shadow_order(u32 shadow_type) 
+{
+#if CONFIG_PAGING_LEVELS > 2
+    static const u32 type_to_order[16] = {
+        0, /* PGC_SH2_none           */
+        1, /* PGC_SH2_l1_32_shadow   */
+        1, /* PGC_SH2_fl1_32_shadow  */
+        2, /* PGC_SH2_l2_32_shadow   */
+        0, /* PGC_SH2_l1_pae_shadow  */
+        0, /* PGC_SH2_fl1_pae_shadow */
+        0, /* PGC_SH2_l2_pae_shadow  */
+        0, /* PGC_SH2_l2h_pae_shadow */
+        1, /* PGC_SH2_l3_pae_shadow  */
+        0, /* PGC_SH2_l1_64_shadow   */
+        0, /* PGC_SH2_fl1_64_shadow  */
+        0, /* PGC_SH2_l2_64_shadow   */
+        0, /* PGC_SH2_l3_64_shadow   */
+        0, /* PGC_SH2_l4_64_shadow   */
+        2, /* PGC_SH2_p2m_table      */
+        0  /* PGC_SH2_monitor_table  */
+        };
+    u32 type = (shadow_type & PGC_SH2_type_mask) >> PGC_SH2_type_shift;
+    return type_to_order[type];
+#else  /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
+    return 0;
+#endif
+}
+
+
+/* Do we have a free chunk of at least this order? */
+static inline int chunk_is_available(struct domain *d, int order)
+{
+    int i;
+    
+    for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
+        if ( !list_empty(&d->arch.shadow2_freelists[i]) )
+            return 1;
+    return 0;
+}
+
+/* Dispatcher function: call the per-mode function that will unhook the
+ * non-Xen mappings in this top-level shadow mfn */
+void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn)
+{
+    struct page_info *pg = mfn_to_page(smfn);
+    switch ( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift )
+    {
+    case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,2,2)(v,smfn);
+#else
+        SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,3,2)(v,smfn);
+#endif
+        break;
+#if CONFIG_PAGING_LEVELS >= 3
+    case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings,3,3)(v,smfn);
+        break;
+#endif
+#if CONFIG_PAGING_LEVELS >= 4
+    case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings,4,4)(v,smfn);
+        break;
+#endif
+    default:
+        SHADOW2_PRINTK("top-level shadow has bad type %08lx\n", 
+                       (unsigned long)((pg->count_info & PGC_SH2_type_mask)
+                                       >> PGC_SH2_type_shift));
+        BUG();
+    }
+}
+
+
+/* Make sure there is at least one chunk of the required order available
+ * in the shadow page pool. This must be called before any calls to
+ * shadow2_alloc().  Since this will free existing shadows to make room,
+ * it must be called early enough to avoid freeing shadows that the
+ * caller is currently working on. */
+void shadow2_prealloc(struct domain *d, unsigned int order)
+{
+    /* Need a vcpu for calling unpins; for now, since we don't have
+     * per-vcpu shadows, any will do */
+    struct vcpu *v = d->vcpu[0];
+    struct list_head *l, *t;
+    struct page_info *pg;
+    mfn_t smfn;
+
+    if ( chunk_is_available(d, order) ) return; 
+    
+    /* Stage one: walk the list of top-level pages, unpinning them */
+    perfc_incrc(shadow2_prealloc_1);
+    list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+    {
+        pg = list_entry(l, struct page_info, list);
+        smfn = page_to_mfn(pg);
+
+#if CONFIG_PAGING_LEVELS >= 3
+        if ( (pg->count_info & PGC_SH2_type_mask) == PGC_SH2_l3_pae_shadow )
+        {
+            /* For PAE, we need to unpin each subshadow on this shadow */
+            SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn);
+        } 
+        else 
+#endif /* 32-bit code always takes this branch */
+        {
+            /* Unpin this top-level shadow */
+            sh2_unpin(v, smfn);
+        }
+
+        /* See if that freed up a chunk of appropriate size */
+        if ( chunk_is_available(d, order) ) return;
+    }
+
+    /* Stage two: all shadow pages are in use in hierarchies that are
+     * loaded in cr3 on some vcpu.  Walk them, unhooking the non-Xen
+     * mappings. */
+    perfc_incrc(shadow2_prealloc_2);
+    v = current;
+    if ( v->domain != d )
+        v = d->vcpu[0];
+    /* Walk the list from the tail: recently used toplevels have been pulled
+     * to the head */
+    list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+    {
+        pg = list_entry(l, struct page_info, list);
+        smfn = page_to_mfn(pg);
+        shadow2_unhook_mappings(v, smfn);
+
+        /* Need to flush TLB if we've altered our own tables */
+        if ( !shadow2_mode_external(d) 
+             && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
+            local_flush_tlb();
+        
+        /* See if that freed up a chunk of appropriate size */
+        if ( chunk_is_available(d, order) ) return;
+    }
+    
+    /* Nothing more we can do: all remaining shadows are of pages that
+     * hold Xen mappings for some vcpu.  This can never happen. */
+    SHADOW2_PRINTK("Can't pre-allocate %i shadow pages!\n"
+                   "  shadow pages total = %u, free = %u, p2m=%u\n",
+                   1 << order, 
+                   d->arch.shadow2_total_pages, 
+                   d->arch.shadow2_free_pages, 
+                   d->arch.shadow2_p2m_pages);
+    BUG();
+}
+
+
+/* Allocate another shadow's worth of (contiguous, aligned) pages,
+ * and fill in the type and backpointer fields of their page_infos. 
+ * Never fails to allocate. */
+mfn_t shadow2_alloc(struct domain *d,  
+                    u32 shadow_type,
+                    unsigned long backpointer)
+{
+    struct page_info *pg = NULL;
+    unsigned int order = shadow_order(shadow_type);
+    cpumask_t mask;
+    void *p;
+    int i;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(order <= SHADOW2_MAX_ORDER);
+    ASSERT(shadow_type != PGC_SH2_none);
+    perfc_incrc(shadow2_alloc);
+
+    /* Find smallest order which can satisfy the request. */
+    for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
+        if ( !list_empty(&d->arch.shadow2_freelists[i]) )
+        {
+            pg = list_entry(d->arch.shadow2_freelists[i].next, 
+                            struct page_info, list);
+            list_del(&pg->list);
+            
+            /* We may have to halve the chunk a number of times. */
+            while ( i != order )
+            {
+                i--;
+                SH2_SET_PFN_ORDER(pg, i);
+                list_add_tail(&pg->list, &d->arch.shadow2_freelists[i]);
+                pg += 1 << i;
+            }
+            d->arch.shadow2_free_pages -= 1 << order;
+
+            /* Init page info fields and clear the pages */
+            for ( i = 0; i < 1<<order ; i++ ) 
+            {
+                pg[i].u.inuse.type_info = backpointer;
+                pg[i].count_info = shadow_type;
+                pg[i].shadow2_flags = 0;
+                INIT_LIST_HEAD(&pg[i].list);
+                /* Before we overwrite the old contents of this page, 
+                 * we need to be sure that no TLB holds a pointer to it. */
+                mask = d->domain_dirty_cpumask;
+                tlbflush_filter(mask, pg[i].tlbflush_timestamp);
+                if ( unlikely(!cpus_empty(mask)) )
+                {
+                    perfc_incrc(shadow2_alloc_tlbflush);
+                    flush_tlb_mask(mask);
+                }
+                /* Now safe to clear the page for reuse */
+                p = sh2_map_domain_page(page_to_mfn(pg+i));
+                ASSERT(p != NULL);
+                clear_page(p);
+                sh2_unmap_domain_page(p);
+                perfc_incr(shadow2_alloc_count);
+            }
+            return page_to_mfn(pg);
+        }
+    
+    /* If we get here, we failed to allocate. This should never happen.
+     * It means that we didn't call shadow2_prealloc() correctly before
+     * we allocated.  We can't recover by calling prealloc here, because
+     * we might free up higher-level pages that the caller is working on. */
+    SHADOW2_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
+    BUG();
+}
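+/*
+ * Typical call shape (illustrative only, not taken from a real caller):
+ *
+ *     shadow2_prealloc(d, shadow_order(PGC_SH2_l1_32_shadow));
+ *     smfn = shadow2_alloc(d, PGC_SH2_l1_32_shadow, mfn_x(gmfn));
+ *
+ * where the backpointer is the mfn of the guest page being shadowed, as
+ * described in the type_info notes above.
+ */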
+
+
+/* Return some shadow pages to the pool. */
+void shadow2_free(struct domain *d, mfn_t smfn)
+{
+    struct page_info *pg = mfn_to_page(smfn); 
+    u32 shadow_type;
+    unsigned long order;
+    unsigned long mask;
+    int i;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    perfc_incrc(shadow2_free);
+
+    shadow_type = pg->count_info & PGC_SH2_type_mask;
+    ASSERT(shadow_type != PGC_SH2_none);
+    ASSERT(shadow_type != PGC_SH2_p2m_table);
+    order = shadow_order(shadow_type);
+
+    d->arch.shadow2_free_pages += 1 << order;
+
+    for ( i = 0; i < 1<<order; i++ ) 
+    {
+        /* Strip out the type: this is now a free shadow page */
+        pg[i].count_info = 0;
+        /* Remember the TLB timestamp so we will know whether to flush 
+         * TLBs when we reuse the page.  Because the destructors leave the
+         * contents of the pages in place, we can delay TLB flushes until
+         * just before the allocator hands the page out again. */
+        pg[i].tlbflush_timestamp = tlbflush_current_time();
+        perfc_decr(shadow2_alloc_count);
+    }
+
+    /* Merge chunks as far as possible. */
+    while ( order < SHADOW2_MAX_ORDER )
+    {
+        mask = 1 << order;
+        if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
+            /* Merge with predecessor block? */
+            if ( (((pg-mask)->count_info & PGC_SH2_type_mask) != PGC_SH2_none) 
+                 || (SH2_PFN_ORDER(pg-mask) != order) )
+                break;
+            list_del(&(pg-mask)->list);
+            pg -= mask;
+        } else {
+            /* Merge with successor block? */
+            if ( (((pg+mask)->count_info & PGC_SH2_type_mask) != PGC_SH2_none)
+                 || (SH2_PFN_ORDER(pg+mask) != order) )
+                break;
+            list_del(&(pg+mask)->list);
+        }
+        order++;
+    }
+
+    SH2_SET_PFN_ORDER(pg, order);
+    list_add_tail(&pg->list, &d->arch.shadow2_freelists[order]);
+}
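+/*
+ * Illustrative merge sequence (not from the original source): freeing the
+ * order-0 shadow at mfn 0x1235 (bit 0 set) examines its buddy at mfn 0x1234;
+ * if that page is free and recorded as order 0, the pair becomes an order-1
+ * block at 0x1234, whose buddy at 0x1236 is then tested at order 1, and so
+ * on up to SHADOW2_MAX_ORDER.
+ */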
+
+/* Divert some memory from the pool to be used by the p2m mapping.
+ * This action is irreversible: the p2m mapping only ever grows.
+ * That's OK because the p2m table only exists for external domains,
+ * and those domains can't ever turn off shadow mode.
+ * Also, we only ever allocate a max-order chunk, so as to preserve
+ * the invariant that shadow2_prealloc() always works.
+ * Returns 0 iff it can't get a chunk (the caller should then
+ * free up some pages in domheap and call set_sh2_allocation);
+ * returns non-zero on success.
+ */
+static int
+shadow2_alloc_p2m_pages(struct domain *d)
+{
+    struct page_info *pg;
+    u32 i;
+    ASSERT(shadow2_lock_is_acquired(d));
+    
+    if ( d->arch.shadow2_total_pages 
+         < (shadow2_min_acceptable_pages(d) + (1<<SHADOW2_MAX_ORDER)) )
+        return 0; /* Not enough shadow memory: need to increase it first */
+    
+    pg = mfn_to_page(shadow2_alloc(d, PGC_SH2_p2m_table, 0));
+    d->arch.shadow2_p2m_pages += (1<<SHADOW2_MAX_ORDER);
+    d->arch.shadow2_total_pages -= (1<<SHADOW2_MAX_ORDER);
+    for (i = 0; i < (1<<SHADOW2_MAX_ORDER); i++)
+    {
+        /* Unlike shadow pages, mark p2m pages as owned by the domain */
+        page_set_owner(&pg[i], d);
+        list_add_tail(&pg[i].list, &d->arch.shadow2_p2m_freelist);
+    }
+    return 1;
+}
+
+// Returns 0 if no memory is available...
+mfn_t
+shadow2_alloc_p2m_page(struct domain *d)
+{
+    struct list_head *entry;
+    mfn_t mfn;
+    void *p;
+
+    if ( list_empty(&d->arch.shadow2_p2m_freelist) &&
+         !shadow2_alloc_p2m_pages(d) )
+        return _mfn(0);
+    entry = d->arch.shadow2_p2m_freelist.next;
+    list_del(entry);
+    list_add_tail(entry, &d->arch.shadow2_p2m_inuse);
+    mfn = page_to_mfn(list_entry(entry, struct page_info, list));
+    sh2_get_ref(mfn, 0);
+    p = sh2_map_domain_page(mfn);
+    clear_page(p);
+    sh2_unmap_domain_page(p);
+
+    return mfn;
+}
+
+#if CONFIG_PAGING_LEVELS == 3
+static void p2m_install_entry_in_monitors(struct domain *d, 
+                                          l3_pgentry_t *l3e) 
+/* Special case, only used for external-mode domains on PAE hosts:
+ * update the mapping of the p2m table.  Once again, this is trivial in
+ * other paging modes (one top-level entry points to the top-level p2m,
+ * no maintenance needed), but PAE makes life difficult by needing a
+ * copy the eight l3es of the p2m table in eight l2h slots in the
+ * monitor table.  This function makes fresh copies when a p2m l3e
+ * changes. */
+{
+    l2_pgentry_t *ml2e;
+    struct vcpu *v;
+    unsigned int index;
+
+    index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
+    ASSERT(index < MACHPHYS_MBYTES>>1);
+
+    for_each_vcpu(d, v) 
+    {
+        if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) 
+            continue;
+        ASSERT(shadow2_mode_external(v->domain));
+
+        SHADOW2_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
+                      d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
+
+        if ( v == current ) /* OK to use linear map of monitor_table */
+            ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
+        else 
+        {
+            l3_pgentry_t *ml3e;
+            ml3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+            ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
+            ml2e = sh2_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
+            ml2e += l2_table_offset(RO_MPT_VIRT_START);
+            sh2_unmap_domain_page(ml3e);
+        }
+        ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
+        if ( v != current )
+            sh2_unmap_domain_page(ml2e);
+    }
+}
+#endif
+
+// Find the next level's P2M entry, checking for out-of-range gfn's...
+// Returns NULL on error.
+//
+static l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+                   unsigned long gfn, u32 shift, u32 max)
+{
+    u32 index;
+
+    index = *gfn_remainder >> shift;
+    if ( index >= max )
+    {
+        SHADOW2_DEBUG(P2M, "gfn=0x%lx out of range "
+                      "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
+                       gfn, *gfn_remainder, shift, index, max);
+        return NULL;
+    }
+    *gfn_remainder &= (1 << shift) - 1;
+    return (l1_pgentry_t *)table + index;
+}
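+/*
+ * Worked example (illustrative): on a 2-level build, looking up gfn 0x12345
+ * at the l2 step uses shift = L2_PAGETABLE_SHIFT - PAGE_SHIFT = 10, giving
+ * index = 0x12345 >> 10 = 0x48 and leaving gfn_remainder = 0x345 for the
+ * l1 step.
+ */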
+
+// Walk one level of the P2M table, allocating a new table if required.
+// Returns 0 on error.
+//
+static int
+p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, 
+               unsigned long *gfn_remainder, unsigned long gfn, u32 shift, 
+               u32 max, unsigned long type)
+{
+    l1_pgentry_t *p2m_entry;
+    void *next;
+
+    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
+                                      shift, max)) )
+        return 0;
+
+    if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
+    {
+        mfn_t mfn = shadow2_alloc_p2m_page(d);
+        if ( mfn_x(mfn) == 0 )
+            return 0;
+        *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+        mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
+        mfn_to_page(mfn)->count_info = 1;
+#if CONFIG_PAGING_LEVELS == 3
+        if (type == PGT_l2_page_table)
+        {
+            /* We have written to the p2m l3: need to sync the per-vcpu
+             * copies of it in the monitor tables */
+            p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
+        }
+#endif
+        /* The P2M can be shadowed: keep the shadows synced */
+        if ( d->vcpu[0] )
+            (void)__shadow2_validate_guest_entry(d->vcpu[0], *table_mfn,
+                                                 p2m_entry, sizeof *p2m_entry);
+    }
+    *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
+    next = sh2_map_domain_page(*table_mfn);
+    sh2_unmap_domain_page(*table);
+    *table = next;
+
+    return 1;
+}
+
+// Returns 0 on error (out of memory)
+int
+shadow2_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+    // XXX -- this might be able to be faster iff current->domain == d
+    mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
+    void *table = sh2_map_domain_page(table_mfn);
+    unsigned long gfn_remainder = gfn;
+    l1_pgentry_t *p2m_entry;
+
+#if CONFIG_PAGING_LEVELS >= 4
+    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
+                         L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
+        return 0;
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    // When using PAE Xen, we only allow 33 bits of pseudo-physical
+    // address in translated guests (i.e. 8 GBytes).  This restriction
+    // comes from wanting to map the P2M table into the 16MB RO_MPT hole
+    // in Xen's address space for translated PV guests: 16MB of 8-byte
+    // l1 entries covers 2M frames of 4kB each, i.e. 8GB.
+    //
+    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                         L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+                         (CONFIG_PAGING_LEVELS == 3
+                          ? 8
+                          : L3_PAGETABLE_ENTRIES),
+                         PGT_l2_page_table) )
+        return 0;
+#endif
+    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                         L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                         L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+        return 0;
+
+    p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                               0, L1_PAGETABLE_ENTRIES);
+    ASSERT(p2m_entry);
+    if ( valid_mfn(mfn) )
+        *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+    else
+        *p2m_entry = l1e_empty();
+
+    /* The P2M can be shadowed: keep the shadows synced */
+    (void) __shadow2_validate_guest_entry(d->vcpu[0], table_mfn, 
+                                          p2m_entry, sizeof *p2m_entry);
+
+    sh2_unmap_domain_page(table);
+
+    return 1;
+}
+
+// Allocate a new p2m table for a domain.
+//
+// The structure of the p2m table is that of a pagetable for xen (i.e. it is
+// controlled by CONFIG_PAGING_LEVELS).
+//
+// Returns 0 if p2m table could not be initialized
+//
+static int
+shadow2_alloc_p2m_table(struct domain *d)
+{
+    mfn_t p2m_top;
+    struct list_head *entry;
+    unsigned int page_count = 0;
+    
+    SHADOW2_PRINTK("allocating p2m table\n");
+    ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
+
+    p2m_top = shadow2_alloc_p2m_page(d);
+    if ( mfn_x(p2m_top) == 0 )
+        return 0;
+
+    mfn_to_page(p2m_top)->count_info = 1;
+    mfn_to_page(p2m_top)->u.inuse.type_info = 
+#if CONFIG_PAGING_LEVELS == 4
+        PGT_l4_page_table
+#elif CONFIG_PAGING_LEVELS == 3
+        PGT_l3_page_table
+#elif CONFIG_PAGING_LEVELS == 2
+        PGT_l2_page_table
+#endif
+        | 1 | PGT_validated;
+
+    d->arch.phys_table = pagetable_from_mfn(p2m_top);
+
+    SHADOW2_PRINTK("populating p2m table\n");
+ 
+    for ( entry = d->page_list.next;
+          entry != &d->page_list;
+          entry = entry->next )
+    {
+        struct page_info *page = list_entry(entry, struct page_info, list);
+        mfn_t mfn = page_to_mfn(page);
+        unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
+        page_count++;
+        if (
+#ifdef __x86_64__
+            (gfn != 0x5555555555555555L)
+#else
+            (gfn != 0x55555555L)
+#endif
+             && gfn != INVALID_M2P_ENTRY
+             && !shadow2_set_p2m_entry(d, gfn, mfn) )
+        {
+            SHADOW2_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" 
SH2_PRI_mfn "\n",
+                           gfn, mfn_x(mfn));
+            return 0;
+        }
+    }
+
+    SHADOW2_PRINTK("p2m table initialised (%u pages)\n", page_count);
+    return 1;
+}
+
+mfn_t
+sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
+/* Read another domain's p2m entries */
+{
+    mfn_t mfn;
+    unsigned long addr = gpfn << PAGE_SHIFT;
+    l2_pgentry_t *l2e;
+    l1_pgentry_t *l1e;
+    
+    ASSERT(shadow2_mode_translate(d));
+    mfn = pagetable_get_mfn(d->arch.phys_table);
+
+
+#if CONFIG_PAGING_LEVELS > 2
+    if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) 
+        /* This pfn is higher than the p2m map can hold */
+        return _mfn(INVALID_MFN);
+#endif
+
+
+#if CONFIG_PAGING_LEVELS >= 4
+    { 
+        l4_pgentry_t *l4e = sh2_map_domain_page(mfn);
+        l4e += l4_table_offset(addr);
+        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
+        {
+            sh2_unmap_domain_page(l4e);
+            return _mfn(INVALID_MFN);
+        }
+        mfn = _mfn(l4e_get_pfn(*l4e));
+        sh2_unmap_domain_page(l4e);
+    }
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    {
+        l3_pgentry_t *l3e = sh2_map_domain_page(mfn);
+        l3e += l3_table_offset(addr);
+        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+        {
+            sh2_unmap_domain_page(l3e);
+            return _mfn(INVALID_MFN);
+        }
+        mfn = _mfn(l3e_get_pfn(*l3e));
+        sh2_unmap_domain_page(l3e);
+    }
+#endif
+
+    l2e = sh2_map_domain_page(mfn);
+    l2e += l2_table_offset(addr);
+    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+    {
+        sh2_unmap_domain_page(l2e);
+        return _mfn(INVALID_MFN);
+    }
+    mfn = _mfn(l2e_get_pfn(*l2e));
+    sh2_unmap_domain_page(l2e);
+
+    l1e = sh2_map_domain_page(mfn);
+    l1e += l1_table_offset(addr);
+    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+    {
+        sh2_unmap_domain_page(l1e);
+        return _mfn(INVALID_MFN);
+    }
+    mfn = _mfn(l1e_get_pfn(*l1e));
+    sh2_unmap_domain_page(l1e);
+
+    return mfn;
+}
+
+unsigned long
+shadow2_gfn_to_mfn_foreign(unsigned long gpfn)
+{
+    return mfn_x(sh2_gfn_to_mfn_foreign(current->domain, gpfn));
+}
+
+
+static void shadow2_p2m_teardown(struct domain *d)
+/* Return all the p2m pages to Xen.
+ * We know we don't have any extra mappings to these pages */
+{
+    struct list_head *entry, *n;
+    struct page_info *pg;
+
+    d->arch.phys_table = pagetable_null();
+
+    list_for_each_safe(entry, n, &d->arch.shadow2_p2m_inuse)
+    {
+        pg = list_entry(entry, struct page_info, list);
+        list_del(entry);
+        /* Should have just the one ref we gave it in alloc_p2m_page() */
+        if ( (pg->count_info & PGC_SH2_count_mask) != 1 )
+        {
+            SHADOW2_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
+                           pg->count_info, pg->u.inuse.type_info);
+        }
+        ASSERT(page_get_owner(pg) == d);
+        /* Free should not decrement domain's total allocation, since 
+         * these pages were allocated without an owner. */
+        page_set_owner(pg, NULL); 
+        free_domheap_pages(pg, 0);
+        d->arch.shadow2_p2m_pages--;
+        perfc_decr(shadow2_alloc_count);
+    }
+    list_for_each_safe(entry, n, &d->arch.shadow2_p2m_freelist)
+    {
+        list_del(entry);
+        pg = list_entry(entry, struct page_info, list);
+        ASSERT(page_get_owner(pg) == d);
+        /* Free should not decrement domain's total allocation. */
+        page_set_owner(pg, NULL); 
+        free_domheap_pages(pg, 0);
+        d->arch.shadow2_p2m_pages--;
+        perfc_decr(shadow2_alloc_count);
+    }
+    ASSERT(d->arch.shadow2_p2m_pages == 0);
+}
+
+/* Set the pool of shadow pages to the required number of pages.
+ * Input will be rounded up to at least shadow2_min_acceptable_pages(),
+ * plus space for the p2m table.
+ * Returns 0 for success, non-zero for failure. */
+static unsigned int set_sh2_allocation(struct domain *d, 
+                                       unsigned int pages,
+                                       int *preempted)
+{
+    struct page_info *pg;
+    unsigned int lower_bound;
+    int j;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    
+    /* Don't allocate less than the minimum acceptable, plus one page per
+     * megabyte of RAM (for the p2m table) */
+    lower_bound = shadow2_min_acceptable_pages(d) + (d->tot_pages / 256);
+    if ( pages > 0 && pages < lower_bound )
+        pages = lower_bound;
+    /* Round up to largest block size */
+    pages = (pages + ((1<<SHADOW2_MAX_ORDER)-1)) & ~((1<<SHADOW2_MAX_ORDER)-1);
+
+    SHADOW2_PRINTK("current %i target %i\n", 
+                   d->arch.shadow2_total_pages, pages);
+
+    while ( d->arch.shadow2_total_pages != pages ) 
+    {
+        if ( d->arch.shadow2_total_pages < pages ) 
+        {
+            /* Need to allocate more memory from domheap */
+            pg = alloc_domheap_pages(NULL, SHADOW2_MAX_ORDER, 0); 
+            if ( pg == NULL ) 
+            { 
+                SHADOW2_PRINTK("failed to allocate shadow pages.\n");
+                return -ENOMEM;
+            }
+            d->arch.shadow2_free_pages += 1<<SHADOW2_MAX_ORDER;
+            d->arch.shadow2_total_pages += 1<<SHADOW2_MAX_ORDER;
+            for ( j = 0; j < 1<<SHADOW2_MAX_ORDER; j++ ) 
+            {
+                pg[j].u.inuse.type_info = 0;  /* Free page */
+                pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
+            }
+            SH2_SET_PFN_ORDER(pg, SHADOW2_MAX_ORDER);
+            list_add_tail(&pg->list, 
+                          &d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]);
+        } 
+        else if ( d->arch.shadow2_total_pages > pages ) 
+        {
+            /* Need to return memory to domheap */
+            shadow2_prealloc(d, SHADOW2_MAX_ORDER);
+            ASSERT(!list_empty(&d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]));
+            pg = list_entry(d->arch.shadow2_freelists[SHADOW2_MAX_ORDER].next, 
+                            struct page_info, list);
+            list_del(&pg->list);
+            d->arch.shadow2_free_pages -= 1<<SHADOW2_MAX_ORDER;
+            d->arch.shadow2_total_pages -= 1<<SHADOW2_MAX_ORDER;
+            free_domheap_pages(pg, SHADOW2_MAX_ORDER);
+        }
+
+        /* Check to see if we need to yield and try again */
+        if ( preempted && hypercall_preempt_check() )
+        {
+            *preempted = 1;
+            return 0;
+        }
+    }
+
+    return 0;
+}
+
+unsigned int shadow2_set_allocation(struct domain *d, 
+                                    unsigned int megabytes,
+                                    int *preempted)
+/* Hypercall interface to set the shadow memory allocation */
+{
+    unsigned int rv;
+    shadow2_lock(d);
+    rv = set_sh2_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted); 
+    SHADOW2_PRINTK("dom %u allocation now %u pages (%u MB)\n",
+                   d->domain_id,
+                   d->arch.shadow2_total_pages,
+                   shadow2_get_allocation(d));
+    shadow2_unlock(d);
+    return rv;
+}
+
+/**************************************************************************/
+/* Hash table for storing the guest->shadow mappings */
+
+/* Hash function that takes a gfn or mfn, plus another byte of type info */
+typedef u32 key_t;
+static inline key_t sh2_hash(unsigned long n, u8 t) 
+{
+    unsigned char *p = (unsigned char *)&n;
+    key_t k = t;
+    int i;
+    for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
+    return k;
+}
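+/*
+ * Illustrative use (mirrors the lookup/insert/delete paths below):
+ *
+ *     key_t key = sh2_hash(n, t);
+ *     head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+ */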
+
+#if SHADOW2_AUDIT & (SHADOW2_AUDIT_HASH|SHADOW2_AUDIT_HASH_FULL)
+
+/* Before we get to the mechanism, define a pair of audit functions
+ * that sanity-check the contents of the hash table. */
+static void sh2_hash_audit_bucket(struct domain *d, int bucket)
+/* Audit one bucket of the hash table */
+{
+    struct shadow2_hash_entry *e, *x;
+    struct page_info *pg;
+
+    if ( !(SHADOW2_AUDIT_ENABLE) )
+        return;
+
+    e = &d->arch.shadow2_hash_table[bucket];
+    if ( e->t == 0 ) return; /* Bucket is empty */ 
+    while ( e )
+    {
+        /* Empty link? */
+        BUG_ON( e->t == 0 ); 
+        /* Bogus type? */
+        BUG_ON( e->t > (PGC_SH2_max_shadow >> PGC_SH2_type_shift) );
+        /* Wrong bucket? */
+        BUG_ON( sh2_hash(e->n, e->t) % SHADOW2_HASH_BUCKETS != bucket ); 
+        /* Duplicate entry? */
+        for ( x = e->next; x; x = x->next )
+            BUG_ON( x->n == e->n && x->t == e->t );
+        /* Bogus MFN? */
+        BUG_ON( !valid_mfn(e->smfn) );
+        pg = mfn_to_page(e->smfn);
+        /* Not a shadow? */
+        BUG_ON( page_get_owner(pg) != 0 );
+        /* Wrong kind of shadow? */
+        BUG_ON( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift 
+                != e->t ); 
+        /* Bad backlink? */
+        BUG_ON( pg->u.inuse.type_info != e->n );
+        if ( e->t != (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+             && e->t != (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+             && e->t != (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) )
+        {
+            /* Bad shadow flags on guest page? */
+            BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow2_flags & (1<<e->t)) );
+        }
+        /* That entry was OK; on we go */
+        e = e->next;
+    }
+}
+
+#else
+#define sh2_hash_audit_bucket(_d, _b)
+#endif /* Hashtable bucket audit */
+
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_HASH_FULL
+
+static void sh2_hash_audit(struct domain *d)
+/* Full audit: audit every bucket in the table */
+{
+    int i;
+
+    if ( !(SHADOW2_AUDIT_ENABLE) )
+        return;
+
+    for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ ) 
+    {
+        sh2_hash_audit_bucket(d, i);
+    }
+}
+
+#else
+#define sh2_hash_audit(_d)
+#endif /* Hashtable bucket audit */
+
+/* Memory management interface for bucket allocation.
+ * These ought to come out of shadow memory, but at least on 32-bit
+ * machines we are forced to allocate them from xenheap so that we can
+ * address them. */
+static struct shadow2_hash_entry *sh2_alloc_hash_entry(struct domain *d)
+{
+    struct shadow2_hash_entry *extra, *x;
+    int i;
+
+    /* We need to allocate a new node. Ensure the free list is not empty. 
+     * Allocate new entries in units the same size as the original table. */
+    if ( unlikely(d->arch.shadow2_hash_freelist == NULL) )
+    {
+        size_t sz = sizeof(void *) + (SHADOW2_HASH_BUCKETS * sizeof(*x));
+        extra = xmalloc_bytes(sz);
+
+        if ( extra == NULL )
+        {
+            /* No memory left! */
+            SHADOW2_ERROR("xmalloc() failed when allocating hash buckets.\n");
+            domain_crash_synchronous();
+        }
+        memset(extra, 0, sz);
+
+        /* Record the allocation block so it can be correctly freed later. */
+        *((struct shadow2_hash_entry **)&extra[SHADOW2_HASH_BUCKETS]) = 
+            d->arch.shadow2_hash_allocations;
+        d->arch.shadow2_hash_allocations = &extra[0];
+
+        /* Thread a free chain through the newly-allocated nodes. */
+        for ( i = 0; i < (SHADOW2_HASH_BUCKETS - 1); i++ )
+            extra[i].next = &extra[i+1];
+        extra[i].next = NULL;
+
+        /* Add the new nodes to the free list. */
+        d->arch.shadow2_hash_freelist = &extra[0];
+    }
+
+    /* Allocate a new node from the free list. */
+    x = d->arch.shadow2_hash_freelist;
+    d->arch.shadow2_hash_freelist = x->next;
+    return x;
+}
+
+static void sh2_free_hash_entry(struct domain *d, struct shadow2_hash_entry *e)
+{
+    /* Mark the bucket as empty and return it to the free list */
+    e->t = 0; 
+    e->next = d->arch.shadow2_hash_freelist;
+    d->arch.shadow2_hash_freelist = e;
+}
+
+
+/* Allocate and initialise the table itself.  
+ * Returns 0 for success, 1 for error. */
+static int shadow2_hash_alloc(struct domain *d)
+{
+    struct shadow2_hash_entry *table;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(!d->arch.shadow2_hash_table);
+
+    table = xmalloc_array(struct shadow2_hash_entry, SHADOW2_HASH_BUCKETS);
+    if ( !table ) return 1;
+    memset(table, 0, 
+           SHADOW2_HASH_BUCKETS * sizeof (struct shadow2_hash_entry));
+    d->arch.shadow2_hash_table = table;
+    return 0;
+}
+
+/* Tear down the hash table and return all memory to Xen.
+ * This function does not care whether the table is populated. */
+static void shadow2_hash_teardown(struct domain *d)
+{
+    struct shadow2_hash_entry *a, *n;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_table);
+
+    /* Return the table itself */
+    xfree(d->arch.shadow2_hash_table);
+    d->arch.shadow2_hash_table = NULL;
+
+    /* Return any extra allocations */
+    a = d->arch.shadow2_hash_allocations;
+    while ( a ) 
+    {
+        /* We stored a linked-list pointer at the end of each allocation */
+        n = *((struct shadow2_hash_entry **)(&a[SHADOW2_HASH_BUCKETS]));
+        xfree(a);
+        a = n;
+    }
+    d->arch.shadow2_hash_allocations = NULL;
+    d->arch.shadow2_hash_freelist = NULL;
+}
+
+
+mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
+/* Find an entry in the hash table.  Returns the MFN of the shadow,
+ * or INVALID_MFN if it doesn't exist */
+{
+    struct domain *d = v->domain;
+    struct shadow2_hash_entry *p, *x, *head;
+    key_t key;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_table);
+    ASSERT(t);
+
+    sh2_hash_audit(d);
+
+    perfc_incrc(shadow2_hash_lookups);
+    key = sh2_hash(n, t);
+
+    x = head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+    p = NULL;
+
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+    do
+    {
+        ASSERT(x->t || ((x == head) && (x->next == NULL)));
+
+        if ( x->n == n && x->t == t )
+        {
+            /* Pull-to-front if 'x' isn't already the head item */
+            if ( unlikely(x != head) )
+            {
+                if ( unlikely(d->arch.shadow2_hash_walking != 0) )
+                    /* Can't reorder: someone is walking the hash chains */
+                    return x->smfn;
+                else 
+                {
+                    /* Delete 'x' from list and reinsert after head. */
+                    p->next = x->next;
+                    x->next = head->next;
+                    head->next = x;
+                    
+                    /* Swap 'x' contents with head contents. */
+                    SWAP(head->n, x->n);
+                    SWAP(head->t, x->t);
+                    SWAP(head->smfn, x->smfn);
+                }
+            }
+            else
+            {
+                perfc_incrc(shadow2_hash_lookup_head);
+            }
+            return head->smfn;
+        }
+
+        p = x;
+        x = x->next;
+    }
+    while ( x != NULL );
+
+    perfc_incrc(shadow2_hash_lookup_miss);
+    return _mfn(INVALID_MFN);
+}
+
+void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Put a mapping (n,t)->smfn into the hash table */
+{
+    struct domain *d = v->domain;
+    struct shadow2_hash_entry *x, *head;
+    key_t key;
+    
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_table);
+    ASSERT(t);
+
+    sh2_hash_audit(d);
+
+    perfc_incrc(shadow2_hash_inserts);
+    key = sh2_hash(n, t);
+
+    head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+    /* If the bucket is empty then insert the new page as the head item. */
+    if ( head->t == 0 )
+    {
+        head->n = n;
+        head->t = t;
+        head->smfn = smfn;
+        ASSERT(head->next == NULL);
+    }
+    else 
+    {
+        /* Insert a new entry directly after the head item. */
+        x = sh2_alloc_hash_entry(d);
+        x->n = n; 
+        x->t = t;
+        x->smfn = smfn;
+        x->next = head->next;
+        head->next = x;
+    }
+    
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+}
+
+void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Excise the mapping (n,t)->smfn from the hash table */
+{
+    struct domain *d = v->domain;
+    struct shadow2_hash_entry *p, *x, *head;
+    key_t key;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_table);
+    ASSERT(t);
+
+    sh2_hash_audit(d);
+
+    perfc_incrc(shadow2_hash_deletes);
+    key = sh2_hash(n, t);
+
+    head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+    /* Match on head item? */
+    if ( head->n == n && head->t == t )
+    {
+        if ( (x = head->next) != NULL )
+        {
+            /* Overwrite head with contents of following node. */
+            head->n = x->n;
+            head->t = x->t;
+            head->smfn = x->smfn;
+
+            /* Delete following node. */
+            head->next = x->next;
+            sh2_free_hash_entry(d, x);
+        }
+        else
+        {
+            /* This bucket is now empty. Initialise the head node. */
+            head->t = 0;
+        }
+    }
+    else 
+    {
+        /* Not at the head; need to walk the chain */
+        p = head;
+        x = head->next; 
+        
+        while(1)
+        {
+            ASSERT(x); /* We can't have hit the end, since our target is
+                        * still in the chain somewhere... */
+            if ( x->n == n && x->t == t )
+            {
+                /* Delete matching node. */
+                p->next = x->next;
+                sh2_free_hash_entry(d, x);
+                break;
+            }
+            p = x;
+            x = x->next;
+        }
+    }
+
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+}
+
+typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
+
+static void hash_foreach(struct vcpu *v, 
+                         unsigned int callback_mask, 
+                         hash_callback_t callbacks[], 
+                         mfn_t callback_mfn)
+/* Walk the hash table looking at the types of the entries and 
+ * calling the appropriate callback function for each entry. 
+ * The mask determines which shadow types we call back for, and the array
+ * of callbacks tells us which function to call.
+ * Any callback may return non-zero to let us skip the rest of the scan. 
+ *
+ * WARNING: Callbacks MUST NOT add or remove hash entries unless they 
+ * then return non-zero to terminate the scan. */
+{
+    int i, done = 0;
+    struct domain *d = v->domain;
+    struct shadow2_hash_entry *x;
+
+    /* Say we're here, to stop hash-lookups reordering the chains */
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_walking == 0);
+    d->arch.shadow2_hash_walking = 1;
+
+    callback_mask &= ~1; /* Never attempt to call back on empty buckets */
+    for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ ) 
+    {
+        /* WARNING: This is not safe against changes to the hash table.
+         * The callback *must* return non-zero if it has inserted or
+         * deleted anything from the hash (lookups are OK, though). */
+        for ( x = &d->arch.shadow2_hash_table[i]; x; x = x->next )
+        {
+            if ( callback_mask & (1 << x->t) ) 
+            {
+                ASSERT(x->t <= 15);
+                ASSERT(callbacks[x->t] != NULL);
+                if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 )
+                    break;
+            }
+        }
+        if ( done ) break; 
+    }
+    d->arch.shadow2_hash_walking = 0; 
+}
+
+
+/**************************************************************************/
+/* Destroy a shadow page: simple dispatcher to call the per-type destructor
+ * which will decrement refcounts appropriately and return memory to the 
+ * free pool. */
+
+void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn)
+{
+    struct page_info *pg = mfn_to_page(smfn);
+    u32 t = pg->count_info & PGC_SH2_type_mask;
+
+    SHADOW2_PRINTK("smfn=%#lx\n", mfn_x(smfn));
+
+    /* Double-check, if we can, that the shadowed page belongs to this
+     * domain, (by following the back-pointer). */
+    ASSERT(t == PGC_SH2_fl1_32_shadow  ||  
+           t == PGC_SH2_fl1_pae_shadow ||  
+           t == PGC_SH2_fl1_64_shadow  || 
+           t == PGC_SH2_monitor_table  || 
+           (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info))) 
+            == v->domain)); 
+
+    /* The down-shifts here are so that the switch statement is on nice
+     * small numbers that the compiler will enjoy */
+    switch ( t >> PGC_SH2_type_shift )
+    {
+#if CONFIG_PAGING_LEVELS == 2
+    case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 2, 2)(v, smfn); 
+        break;
+    case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 2, 2)(v, smfn);
+        break;
+#else /* PAE or 64bit */
+    case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 2)(v, smfn);
+        break;
+    case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 2)(v, smfn);
+        break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+    case PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 3)(v, smfn);
+        break;
+    case PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 3)(v, smfn);
+        break;
+    case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 3, 3)(v, smfn);
+        break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+    case PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 4, 4)(v, smfn);
+        break;
+    case PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 4, 4)(v, smfn);
+        break;
+    case PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 4, 4)(v, smfn);
+        break;
+    case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, 4, 4)(v, smfn);
+        break;
+#endif
+    default:
+        SHADOW2_PRINTK("tried to destroy shadow of bad type %08lx\n", 
+                       (unsigned long)t);
+        BUG();
+    }    
+}
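
The down-shift above is the same trick that builds the per-type dispatch tables and callback masks in the functions below: shifting the type field gives a small 0..15 index usable as a switch case or array index, and 1 << index gives that type's bit in a 16-bit callback mask. A tiny illustration with made-up EX_* constants (not the real PGC_SH2_* definitions):

    /* Illustrative only: a 4-bit type field stored high in a word. */
    #define EX_TYPE_SHIFT  28
    #define EX_L1_32       (0x1UL << EX_TYPE_SHIFT)
    #define EX_FL1_32      (0x2UL << EX_TYPE_SHIFT)

    static unsigned int ex_type_index(unsigned long type)
    {
        return type >> EX_TYPE_SHIFT;       /* small enough for a switch */
    }

    static unsigned int ex_type_bit(unsigned long type)
    {
        return 1u << ex_type_index(type);   /* bit in a 16-entry callback mask */
    }

    /* e.g. ex_type_bit(EX_L1_32) | ex_type_bit(EX_FL1_32) == 0x6, which is
     * how the callback_mask values below are assembled. */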
+
+/**************************************************************************/
+/* Remove all writeable mappings of a guest frame from the shadow tables 
+ * Returns non-zero if we need to flush TLBs. 
+ * level and fault_addr describe how we found this to be a pagetable;
+ * level==0 means we have some other reason for revoking write access. */
+
+int shadow2_remove_write_access(struct vcpu *v, mfn_t gmfn, 
+                                unsigned int level,
+                                unsigned long fault_addr)
+{
+    /* Dispatch table for getting per-type functions */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* fl1_32  */
+#else 
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* fl1_32  */
+#endif
+        NULL, /* l2_32   */
+#if CONFIG_PAGING_LEVELS >= 3
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* l1_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* fl1_pae */
+#else 
+        NULL, /* l1_pae  */
+        NULL, /* fl1_pae */
+#endif
+        NULL, /* l2_pae  */
+        NULL, /* l2h_pae */
+        NULL, /* l3_pae  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* l1_64   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* fl1_64  */
+#else
+        NULL, /* l1_64   */
+        NULL, /* fl1_64  */
+#endif
+        NULL, /* l2_64   */
+        NULL, /* l3_64   */
+        NULL, /* l4_64   */
+        NULL, /* p2m     */
+        NULL  /* unused  */
+    };
+
+    static unsigned int callback_mask = 
+          1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
+        ;
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+
+    /* Only remove writable mappings if we are doing shadow refcounts.
+     * In guest refcounting, we trust Xen to already be restricting
+     * all the writes to the guest page tables, so we do not need to
+     * do more. */
+    if ( !shadow2_mode_refcounts(v->domain) )
+        return 0;
+
+    /* Early exit if it's already a pagetable, or otherwise not writeable */
+    if ( sh2_mfn_is_a_page_table(gmfn) 
+         || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
+        return 0;
+
+    perfc_incrc(shadow2_writeable);
+
+    /* If this isn't a "normal" writeable page, the domain is trying to 
+     * put pagetables in special memory of some kind.  We can't allow that. */
+    if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
+    {
+        SHADOW2_ERROR("can't remove write access to mfn %lx, type_info is %" 
+                      PRtype_info "\n",
+                      mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
+        domain_crash(v->domain);
+    }
+
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
+    if ( v == current && level != 0 )
+    {
+        unsigned long gfn;
+        /* Heuristic: there is likely to be only one writeable mapping,
+         * and that mapping is likely to be in the current pagetable,
+         * either in the guest's linear map (linux, windows) or in a
+         * magic slot used to map high memory regions (linux HIGHPTE) */
+
+#define GUESS(_a, _h) do {                                              \
+            if ( v->arch.shadow2->guess_wrmap(v, (_a), gmfn) )          \
+                perfc_incrc(shadow2_writeable_h_ ## _h);                \
+            if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )        \
+                return 1;                                               \
+        } while (0)
+
+        
+        /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */
+        if ( v == current 
+             && (gfn = sh2_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 )
+            GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4);
+
+        if ( v->arch.shadow2->guest_levels == 2 )
+        {
+            if ( level == 1 )
+                /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
+                GUESS(0xC0000000UL + (fault_addr >> 10), 1);
+        }
+#if CONFIG_PAGING_LEVELS >= 3
+        else if ( v->arch.shadow2->guest_levels == 3 )
+        {
+            /* 32bit PAE w2k3: linear map at 0xC0000000 */
+            switch ( level ) 
+            {
+            case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
+            case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
+            }
+        }
+#if CONFIG_PAGING_LEVELS >= 4
+        else if ( v->arch.shadow2->guest_levels == 4 )
+        {
+            /* 64bit w2k3: linear map at 0x0000070000000000 */
+            switch ( level ) 
+            {
+            case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
+            case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
+            case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
+            }
+        }
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS >= 3 */
+
+#undef GUESS
+
+    }
+#endif
+    
+    /* Brute-force search of all the shadows, by walking the hash */
+    perfc_incrc(shadow2_writeable_bf);
+    hash_foreach(v, callback_mask, callbacks, gmfn);
+
+    /* If that didn't catch the mapping, something is very wrong */
+    if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
+    {
+        SHADOW2_ERROR("can't find all writeable mappings of mfn %lx: "
+                      "%lu left\n", mfn_x(gmfn),
+                      (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
+        domain_crash(v->domain);
+    }
+    
+    /* We killed at least one writeable mapping, so must flush TLBs. */
+    return 1;
+}
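
To make the GUESS() heuristic concrete for the simplest case only (a 2-level guest, 4kB pages, 4-byte PTEs, linear map at 0xC0000000): the guest L1 entry mapping va lives at linear_base + (va >> 12) * 4, and the code's shorter va >> 10 differs from that by at most 3 bytes, so it always probes the right page. A stand-alone sketch with a made-up fault address (illustration, not hypervisor code):

    #include <stdio.h>

    int main(void)
    {
        unsigned long linear_base = 0xC0000000UL;   /* guest linear PT map */
        unsigned long fault_addr  = 0xB7301234UL;   /* hypothetical faulting VA */

        /* Exact byte address of the L1 entry that maps fault_addr... */
        unsigned long pte_va   = linear_base + ((fault_addr >> 12) << 2);
        /* ...and the cheaper expression used by GUESS(), which lands in
         * the same 4kB page -- all that guess_wrmap() needs. */
        unsigned long guess_va = linear_base + (fault_addr >> 10);

        printf("pte entry at %#lx, heuristic probes %#lx\n", pte_va, guess_va);
        return 0;
    }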
+
+
+
+/**************************************************************************/
+/* Remove all mappings of a guest frame from the shadow tables.
+ * Returns non-zero if we need to flush TLBs. */
+
+int shadow2_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
+{
+    struct page_info *page = mfn_to_page(gmfn);
+    int expected_count;
+
+    /* Dispatch table for getting per-type functions */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* fl1_32  */
+#else 
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* fl1_32  */
+#endif
+        NULL, /* l2_32   */
+#if CONFIG_PAGING_LEVELS >= 3
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* l1_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* fl1_pae */
+#else 
+        NULL, /* l1_pae  */
+        NULL, /* fl1_pae */
+#endif
+        NULL, /* l2_pae  */
+        NULL, /* l2h_pae */
+        NULL, /* l3_pae  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* l1_64   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* fl1_64  */
+#else
+        NULL, /* l1_64   */
+        NULL, /* fl1_64  */
+#endif
+        NULL, /* l2_64   */
+        NULL, /* l3_64   */
+        NULL, /* l4_64   */
+        NULL, /* p2m     */
+        NULL  /* unused  */
+    };
+
+    static unsigned int callback_mask = 
+          1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
+        ;
+
+    perfc_incrc(shadow2_mappings);
+    if ( (page->count_info & PGC_count_mask) == 0 )
+        return 0;
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+
+    /* XXX TODO: 
+     * Heuristics for finding the (probably) single mapping of this gmfn */
+    
+    /* Brute-force search of all the shadows, by walking the hash */
+    perfc_incrc(shadow2_mappings_bf);
+    hash_foreach(v, callback_mask, callbacks, gmfn);
+
+    /* If that didn't catch the mapping, something is very wrong */
+    expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
+    if ( (page->count_info & PGC_count_mask) != expected_count )
+    {
+        /* Don't complain if we're in HVM and there's one extra mapping: 
+         * The qemu helper process has an untyped mapping of this dom's RAM */
+        if ( !(shadow2_mode_external(v->domain)
+               && (page->count_info & PGC_count_mask) <= 2
+               && (page->u.inuse.type_info & PGT_count_mask) == 0) )
+        {
+            SHADOW2_ERROR("can't find all mappings of mfn %lx: "
+                          "c=%08x t=%08lx\n", mfn_x(gmfn), 
+                          page->count_info, page->u.inuse.type_info);
+        }
+    }
+
+    /* We killed at least one mapping, so must flush TLBs. */
+    return 1;
+}
+
+
+/**************************************************************************/
+/* Remove all shadows of a guest frame from the shadow tables */
+
+static int sh2_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
+/* Follow this shadow's up-pointer, if it has one, and remove the reference
+ * found there.  Returns 1 if that was the only reference to this shadow */
+{
+    struct page_info *pg = mfn_to_page(smfn);
+    mfn_t pmfn;
+    void *vaddr;
+    int rc;
+
+    ASSERT((pg->count_info & PGC_SH2_type_mask) > 0);
+    ASSERT((pg->count_info & PGC_SH2_type_mask) < PGC_SH2_max_shadow);
+    ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l2_32_shadow);
+    ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l3_pae_shadow);
+    ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l4_64_shadow);
+    
+    if (pg->up == 0) return 0;
+    pmfn = _mfn(pg->up >> PAGE_SHIFT);
+    ASSERT(valid_mfn(pmfn));
+    vaddr = sh2_map_domain_page(pmfn);
+    ASSERT(vaddr);
+    vaddr += pg->up & (PAGE_SIZE-1);
+    ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
+    
+    /* Is this the only reference to this shadow? */
+    rc = ((pg->count_info & PGC_SH2_count_mask) == 1) ? 1 : 0;
+
+    /* Blank the offending entry */
+    switch ((pg->count_info & PGC_SH2_type_mask)) 
+    {
+    case PGC_SH2_l1_32_shadow:
+    case PGC_SH2_l2_32_shadow:
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,2,2)(v, vaddr, pmfn);
+#else
+        SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,2)(v, vaddr, pmfn);
+#endif
+        break;
+#if CONFIG_PAGING_LEVELS >=3
+    case PGC_SH2_l1_pae_shadow:
+    case PGC_SH2_l2_pae_shadow:
+    case PGC_SH2_l2h_pae_shadow:
+    case PGC_SH2_l3_pae_shadow:
+        SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,3)(v, vaddr, pmfn);
+        break;
+#if CONFIG_PAGING_LEVELS >= 4
+    case PGC_SH2_l1_64_shadow:
+    case PGC_SH2_l2_64_shadow:
+    case PGC_SH2_l3_64_shadow:
+    case PGC_SH2_l4_64_shadow:
+        SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,4,4)(v, vaddr, pmfn);
+        break;
+#endif
+#endif
+    default: BUG(); /* Some weird unknown shadow type */
+    }
+    
+    sh2_unmap_domain_page(vaddr);
+    if ( rc )
+        perfc_incrc(shadow2_up_pointer);
+    else
+        perfc_incrc(shadow2_unshadow_bf);
+
+    return rc;
+}
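
The walk above relies on pg->up packing, in one word, the frame number of the single shadow page that references this shadow (high bits) and the byte offset of the referencing entry within that page (low bits); that is exactly what the >> PAGE_SHIFT and & (PAGE_SIZE-1) decode. A minimal sketch of that encoding, with illustrative EX_* names rather than the Xen macros:

    #define EX_PAGE_SHIFT 12
    #define EX_PAGE_SIZE  (1UL << EX_PAGE_SHIFT)

    /* Pack (frame number, byte offset within that frame) into one word. */
    static unsigned long up_pack(unsigned long pmfn, unsigned long offset)
    {
        return (pmfn << EX_PAGE_SHIFT) | (offset & (EX_PAGE_SIZE - 1));
    }

    /* Recover both halves, as sh2_remove_shadow_via_pointer() does above. */
    static void up_unpack(unsigned long up,
                          unsigned long *pmfn, unsigned long *offset)
    {
        *pmfn   = up >> EX_PAGE_SHIFT;
        *offset = up & (EX_PAGE_SIZE - 1);
    }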
+
+void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all)
+/* Remove the shadows of this guest page.  
+ * If all != 0, find all shadows, if necessary by walking the tables.
+ * Otherwise, just try the (much faster) heuristics, which will remove 
+ * at most one reference to each shadow of the page. */
+{
+    struct page_info *pg;
+    mfn_t smfn;
+    u32 sh_flags;
+    unsigned char t;
+
+    /* Dispatch table for getting per-type functions: each level must
+     * be called with the function to remove a lower-level shadow. */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+        NULL, /* l1_32   */
+        NULL, /* fl1_32  */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,2,2), /* l2_32   */
+#else 
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,2), /* l2_32   */
+#endif
+        NULL, /* l1_pae  */
+        NULL, /* fl1_pae */
+#if CONFIG_PAGING_LEVELS >= 3
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2h_pae */
+        SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,3,3), /* l3_pae  */
+#else 
+        NULL, /* l2_pae  */
+        NULL, /* l2h_pae */
+        NULL, /* l3_pae  */
+#endif
+        NULL, /* l1_64   */
+        NULL, /* fl1_64  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,4,4), /* l2_64   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,4,4), /* l3_64   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow,4,4), /* l4_64   */
+#else
+        NULL, /* l2_64   */
+        NULL, /* l3_64   */
+        NULL, /* l4_64   */
+#endif
+        NULL, /* p2m     */
+        NULL  /* unused  */
+    };
+
+    /* Another lookup table, for choosing which mask to use */
+    static unsigned int masks[16] = {
+        0, /* none    */
+        1 << (PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift), /* l1_32   */
+        0, /* fl1_32  */
+        0, /* l2_32   */
+        ((1 << (PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift))
+         | (1 << (PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift))), /* l1_pae  */
+        0, /* fl1_pae */
+        1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2_pae  */
+        1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2h_pae  */
+        0, /* l3_pae  */
+        1 << (PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift), /* l1_64   */
+        0, /* fl1_64  */
+        1 << (PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift), /* l2_64   */
+        1 << (PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift), /* l3_64   */
+        0, /* l4_64   */
+        0, /* p2m     */
+        0  /* unused  */
+    };
+
+    SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
+                   v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+
+    pg = mfn_to_page(gmfn);
+
+    /* Bale out now if the page is not shadowed */
+    if ( (pg->count_info & PGC_page_table) == 0 )
+        return;
+
+    /* Search for this shadow in all appropriate shadows */
+    perfc_incrc(shadow2_unshadow);
+    sh_flags = pg->shadow2_flags;
+
+    /* Lower-level shadows need to be excised from upper-level shadows.
+     * This call to hash_foreach() looks dangerous but is in fact OK: each
+     * call will remove at most one shadow, and terminate immediately when
+     * it does remove it, so we never walk the hash after doing a deletion.  */
+#define DO_UNSHADOW(_type) do {                                 \
+    t = (_type) >> PGC_SH2_type_shift;                          \
+    smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t);              \
+    if ( !sh2_remove_shadow_via_pointer(v, smfn) && all )       \
+        hash_foreach(v, masks[t], callbacks, smfn);             \
+} while (0)
+
+    /* Top-level shadows need to be unpinned */
+#define DO_UNPIN(_type) do {                                             \
+    t = (_type) >> PGC_SH2_type_shift;                                   \
+    smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t);                       \
+    if ( mfn_to_page(smfn)->count_info & PGC_SH2_pinned )                \
+        sh2_unpin(v, smfn);                                              \
+    if ( (_type) == PGC_SH2_l3_pae_shadow )                              \
+        SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn); \
+} while (0)
+
+    if ( sh_flags & SH2F_L1_32 )   DO_UNSHADOW(PGC_SH2_l1_32_shadow);
+    if ( sh_flags & SH2F_L2_32 )   DO_UNPIN(PGC_SH2_l2_32_shadow);
+#if CONFIG_PAGING_LEVELS >= 3
+    if ( sh_flags & SH2F_L1_PAE )  DO_UNSHADOW(PGC_SH2_l1_pae_shadow);
+    if ( sh_flags & SH2F_L2_PAE )  DO_UNSHADOW(PGC_SH2_l2_pae_shadow);
+    if ( sh_flags & SH2F_L2H_PAE ) DO_UNSHADOW(PGC_SH2_l2h_pae_shadow);
+    if ( sh_flags & SH2F_L3_PAE )  DO_UNPIN(PGC_SH2_l3_pae_shadow);
+#if CONFIG_PAGING_LEVELS >= 4
+    if ( sh_flags & SH2F_L1_64 )   DO_UNSHADOW(PGC_SH2_l1_64_shadow);
+    if ( sh_flags & SH2F_L2_64 )   DO_UNSHADOW(PGC_SH2_l2_64_shadow);
+    if ( sh_flags & SH2F_L3_64 )   DO_UNSHADOW(PGC_SH2_l3_64_shadow);
+    if ( sh_flags & SH2F_L4_64 )   DO_UNPIN(PGC_SH2_l4_64_shadow);
+#endif
+#endif
+
+#undef DO_UNSHADOW
+#undef DO_UNPIN
+
+
+#if CONFIG_PAGING_LEVELS > 2
+    /* We may have caused some PAE l3 entries to change: need to 
+     * fix up the copies of them in various places */
+    if ( sh_flags & (SH2F_L2_PAE|SH2F_L2H_PAE) )
+        sh2_pae_recopy(v->domain);
+#endif
+
+    /* If that didn't catch the shadows, something is wrong */
+    if ( all && (pg->count_info & PGC_page_table) )
+    {
+        SHADOW2_ERROR("can't find all shadows of mfn %05lx (shadow2_flags=%08x)\n",
+                      mfn_x(gmfn), pg->shadow2_flags);
+        domain_crash(v->domain);
+    }
+}
+
+void
+shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
+/* Even harsher: this is an HVM page that we think is no longer a pagetable.
+ * Unshadow it, and recursively unshadow pages that reference it. */
+{
+    shadow2_remove_all_shadows(v, gmfn);
+    /* XXX TODO:
+     * Rework this hashtable walker to return a linked-list of all 
+     * the shadows it modified, then do breadth-first recursion 
+     * to find the way up to higher-level tables and unshadow them too. 
+     *
+     * The current code (just tearing down each page's shadows as we
+     * detect that it is not a pagetable) is correct, but very slow. 
+     * It means extra emulated writes and slows down removal of mappings. */
+}
+
+/**************************************************************************/
+
+void sh2_update_paging_modes(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    struct shadow2_entry_points *old_entries = v->arch.shadow2;
+    mfn_t old_guest_table;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+
+    // Valid transitions handled by this function:
+    // - For PV guests:
+    //     - after a shadow mode has been changed
+    // - For HVM guests:
+    //     - after a shadow mode has been changed
+    //     - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
+    //
+
+    // Avoid determining the current shadow2 mode for uninitialized CPUs, as
+    // we can not yet determine whether it is an HVM or PV domain.
+    //
+    if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+    {
+        printk("%s: postponing determination of shadow2 mode\n", __func__);
+        return;
+    }
+
+    // First, tear down any old shadow tables held by this vcpu.
+    //
+    if ( v->arch.shadow2 )
+        shadow2_detach_old_tables(v);
+
+    if ( !hvm_guest(v) )
+    {
+        ///
+        /// PV guest
+        ///
+#if CONFIG_PAGING_LEVELS == 4
+        if ( pv_32bit_guest(v) )
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 3);
+        else
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4);
+#elif CONFIG_PAGING_LEVELS == 3
+        v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3);
+#elif CONFIG_PAGING_LEVELS == 2
+        v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2);
+#else
+#error unexpected paging mode
+#endif
+    }
+    else
+    {
+        ///
+        /// HVM guest
+        ///
+        ASSERT(shadow2_mode_translate(d));
+        ASSERT(shadow2_mode_external(d));
+
+        if ( !hvm_paging_enabled(v) )
+        {
+            // paging disabled...
+            clear_bit(_VCPUF_shadow2_translate, &v->vcpu_flags);
+            
+            /* Set v->arch.guest_table to use the p2m map, and choose
+             * the appropriate shadow mode */
+            old_guest_table = pagetable_get_mfn(v->arch.guest_table);
+#if CONFIG_PAGING_LEVELS == 2
+            v->arch.guest_table =
+                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,2,2);
+#elif CONFIG_PAGING_LEVELS == 3 
+            v->arch.guest_table =
+                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3);
+#else /* CONFIG_PAGING_LEVELS == 4 */
+            { 
+                l4_pgentry_t *l4e; 
+                /* Use the start of the first l3 table as a PAE l3 */
+                ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
+                l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+                ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
+                v->arch.guest_table =
+                    pagetable_from_pfn(l4e_get_pfn(l4e[0]));
+                sh2_unmap_domain_page(l4e);
+            }
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3);
+#endif
+            /* Fix up refcounts on guest_table */
+            get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
+            if ( mfn_x(old_guest_table) != 0 )
+                put_page(mfn_to_page(old_guest_table));
+        }
+        else
+        {
+            set_bit(_VCPUF_shadow2_translate, &v->vcpu_flags);
+
+#ifdef __x86_64__
+            if ( hvm_long_mode_enabled(v) )
+            {
+                // long mode guest...
+                v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4);
+            }
+            else
+#endif
+                if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE )
+                {
+#if CONFIG_PAGING_LEVELS >= 3
+                    // 32-bit PAE mode guest...
+                    v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3);
+#else
+                    SHADOW2_ERROR("PAE not supported in 32-bit Xen\n");
+                    domain_crash(d);
+                    return;
+#endif
+                }
+                else
+                {
+                    // 32-bit 2 level guest...
+#if CONFIG_PAGING_LEVELS >= 3
+                    v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 2);
+#else
+                    v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2);
+#endif
+                }
+        }
+        
+        if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
+        {
+            mfn_t mmfn = shadow2_make_monitor_table(v);
+            v->arch.monitor_table = pagetable_from_mfn(mmfn);
+            v->arch.monitor_vtable = sh2_map_domain_page(mmfn);
+        } 
+
+        if ( v->arch.shadow2 != old_entries )
+        {
+            SHADOW2_PRINTK("new paging mode: d=%u v=%u g=%u s=%u "
+                           "(was g=%u s=%u)\n",
+                           d->domain_id, v->vcpu_id, 
+                           v->arch.shadow2->guest_levels,
+                           v->arch.shadow2->shadow_levels,
+                           old_entries ? old_entries->guest_levels : 0,
+                           old_entries ? old_entries->shadow_levels : 0);
+            if ( old_entries &&
+                 (v->arch.shadow2->shadow_levels !=
+                  old_entries->shadow_levels) )
+            {
+                /* Need to make a new monitor table for the new mode */
+                mfn_t new_mfn, old_mfn;
+
+                if ( v != current ) 
+                {
+                    SHADOW2_ERROR("Some third party (d=%u v=%u) is changing "
+                                  "this HVM vcpu's (d=%u v=%u) paging mode!\n",
+                                  current->domain->domain_id, current->vcpu_id,
+                                  v->domain->domain_id, v->vcpu_id);
+                    domain_crash(v->domain);
+                    return;
+                }
+
+                sh2_unmap_domain_page(v->arch.monitor_vtable);
+                old_mfn = pagetable_get_mfn(v->arch.monitor_table);
+                v->arch.monitor_table = pagetable_null();
+                new_mfn = v->arch.shadow2->make_monitor_table(v);            
+                v->arch.monitor_table = pagetable_from_mfn(new_mfn);
+                v->arch.monitor_vtable = sh2_map_domain_page(new_mfn);
+                SHADOW2_PRINTK("new monitor table %"SH2_PRI_mfn "\n",
+                               mfn_x(new_mfn));
+
+                /* Don't be running on the old monitor table when we 
+                 * pull it down!  Switch CR3, and warn the HVM code that
+                 * its host cr3 has changed. */
+                make_cr3(v, mfn_x(new_mfn));
+                write_ptbase(v);
+                hvm_update_host_cr3(v);
+                old_entries->destroy_monitor_table(v, old_mfn);
+            }
+        }
+
+        // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
+        //        These are HARD: think about the case where two CPU's have
+        //        different values for CR4.PSE and CR4.PGE at the same time.
+        //        This *does* happen, at least for CR4.PGE...
+    }
+
+    v->arch.shadow2->update_cr3(v);
+}
+
+/**************************************************************************/
+/* Turning on and off shadow2 features */
+
+static void sh2_new_mode(struct domain *d, u32 new_mode)
+/* Inform all the vcpus that the shadow mode has been changed */
+{
+    struct vcpu *v;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d != current->domain);
+    d->arch.shadow2_mode = new_mode;
+    if ( new_mode & SHM2_translate ) 
+        shadow2_audit_p2m(d);
+    for_each_vcpu(d, v)
+        sh2_update_paging_modes(v);
+}
+
+static int shadow2_enable(struct domain *d, u32 mode)
+/* Turn on "permanent" shadow features: external, translate, refcount.
+ * Can only be called once on a domain, and these features cannot be
+ * disabled. 
+ * Returns 0 for success, -errno for failure. */
+{    
+    unsigned int old_pages;
+    int rv = 0;
+
+    domain_pause(d);
+    shadow2_lock(d);
+
+    /* Sanity check the arguments */
+    if ( d == current->domain 
+         || shadow2_mode_enabled(d)
+         || !(mode & SHM2_enable)
+         || ((mode & SHM2_external) && !(mode & SHM2_translate)) )
+    {
+        rv = -EINVAL;
+        goto out;
+    }
+
+    // XXX -- eventually would like to require that all memory be allocated
+    // *after* shadow2_enabled() is called...  So here, we would test to make
+    // sure that d->page_list is empty.
+#if 0
+    spin_lock(&d->page_alloc_lock);
+    if ( !list_empty(&d->page_list) )
+    {
+        spin_unlock(&d->page_alloc_lock);
+        rv = -EINVAL;
+        goto out;
+    }
+    spin_unlock(&d->page_alloc_lock);
+#endif
+
+    /* Init the shadow memory allocation if the user hasn't done so */
+    old_pages = d->arch.shadow2_total_pages;
+    if ( old_pages == 0 )
+        if ( set_sh2_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
+        {
+            set_sh2_allocation(d, 0, NULL);
+            rv = -ENOMEM;
+            goto out;
+        }
+
+    /* Init the hash table */
+    if ( shadow2_hash_alloc(d) != 0 )
+    {
+        set_sh2_allocation(d, old_pages, NULL);            
+        rv = -ENOMEM;
+        goto out;
+    }
+
+    /* Init the P2M table */
+    if ( mode & SHM2_translate )
+        if ( !shadow2_alloc_p2m_table(d) )
+        {
+            shadow2_hash_teardown(d);
+            set_sh2_allocation(d, old_pages, NULL);
+            shadow2_p2m_teardown(d);
+            rv = -ENOMEM;
+            goto out;
+        }
+
+    /* Update the bits */
+    sh2_new_mode(d, mode);
+    shadow2_audit_p2m(d);
+ out:
+    shadow2_unlock(d);
+    domain_unpause(d);
+    return rv;
+}
+
+void shadow2_teardown(struct domain *d)
+/* Destroy the shadow pagetables of this domain and free its shadow memory.
+ * Should only be called for dying domains. */
+{
+    struct vcpu *v;
+    mfn_t mfn;
+
+    ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
+    ASSERT(d != current->domain);
+
+    if ( !shadow2_lock_is_acquired(d) )
+        shadow2_lock(d); /* Keep various asserts happy */
+
+    if ( shadow2_mode_enabled(d) )
+    {
+        /* Release the shadow and monitor tables held by each vcpu */
+        for_each_vcpu(d, v)
+        {
+            if ( v->arch.shadow2 )
+                shadow2_detach_old_tables(v);
+            if ( shadow2_mode_external(d) )
+            {
+                mfn = pagetable_get_mfn(v->arch.monitor_table);
+                if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
+                    shadow2_destroy_monitor_table(v, mfn);
+                v->arch.monitor_table = pagetable_null();
+            }
+        }
+    }
+
+    if ( d->arch.shadow2_total_pages != 0 )
+    {
+        SHADOW2_PRINTK("teardown of domain %u starts."
+                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->domain_id,
+                       d->arch.shadow2_total_pages, 
+                       d->arch.shadow2_free_pages, 
+                       d->arch.shadow2_p2m_pages);
+        /* Destroy all the shadows and release memory to domheap */
+        set_sh2_allocation(d, 0, NULL);
+        /* Release the hash table back to xenheap */
+        if (d->arch.shadow2_hash_table) 
+            shadow2_hash_teardown(d);
+        /* Release the log-dirty bitmap of dirtied pages */
+        sh2_free_log_dirty_bitmap(d);
+        /* Should not have any more memory held */
+        SHADOW2_PRINTK("teardown done."
+                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->arch.shadow2_total_pages, 
+                       d->arch.shadow2_free_pages, 
+                       d->arch.shadow2_p2m_pages);
+        ASSERT(d->arch.shadow2_total_pages == 0);
+    }
+
+    /* We leave the "permanent" shadow modes enabled, but clear the
+     * log-dirty mode bit.  We don't want any more mark_dirty()
+     * calls now that we've torn down the bitmap */
+    d->arch.shadow2_mode &= ~SHM2_log_dirty;
+
+    shadow2_unlock(d);
+}
+
+void shadow2_final_teardown(struct domain *d)
+/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
+{
+
+    SHADOW2_PRINTK("dom %u final teardown starts."
+                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                   d->domain_id,
+                   d->arch.shadow2_total_pages, 
+                   d->arch.shadow2_free_pages, 
+                   d->arch.shadow2_p2m_pages);
+
+    /* Double-check that the domain didn't have any shadow memory.  
+     * It is possible for a domain that never got domain_kill()ed
+     * to get here with its shadow allocation intact. */
+    if ( d->arch.shadow2_total_pages != 0 )
+        shadow2_teardown(d);
+
+    /* It is now safe to pull down the p2m map. */
+    if ( d->arch.shadow2_p2m_pages != 0 )
+        shadow2_p2m_teardown(d);
+
+    SHADOW2_PRINTK("dom %u final teardown done."
+                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                   d->domain_id,
+                   d->arch.shadow2_total_pages, 
+                   d->arch.shadow2_free_pages, 
+                   d->arch.shadow2_p2m_pages);
+}
+
+static int shadow2_one_bit_enable(struct domain *d, u32 mode)
+/* Turn on a single shadow mode feature */
+{
+    ASSERT(shadow2_lock_is_acquired(d));
+
+    /* Sanity check the call */
+    if ( d == current->domain || (d->arch.shadow2_mode & mode) )
+    {
+        return -EINVAL;
+    }
+
+    if ( d->arch.shadow2_mode == 0 )
+    {
+        /* Init the shadow memory allocation and the hash table */
+        if ( set_sh2_allocation(d, 1, NULL) != 0 
+             || shadow2_hash_alloc(d) != 0 )
+        {
+            set_sh2_allocation(d, 0, NULL);
+            return -ENOMEM;
+        }
+    }
+
+    /* Update the bits */
+    sh2_new_mode(d, d->arch.shadow2_mode | mode);
+
+    return 0;
+}
+
+static int shadow2_one_bit_disable(struct domain *d, u32 mode) 
+/* Turn off a single shadow mode feature */
+{
+    struct vcpu *v;
+    ASSERT(shadow2_lock_is_acquired(d));
+
+    /* Sanity check the call */
+    if ( d == current->domain || !(d->arch.shadow2_mode & mode) )
+    {
+        return -EINVAL;
+    }
+
+    /* Update the bits */
+    sh2_new_mode(d, d->arch.shadow2_mode & ~mode);
+    if ( d->arch.shadow2_mode == 0 )
+    {
+        /* Get this domain off shadows */
+        SHADOW2_PRINTK("un-shadowing of domain %u starts."
+                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->domain_id,
+                       d->arch.shadow2_total_pages, 
+                       d->arch.shadow2_free_pages, 
+                       d->arch.shadow2_p2m_pages);
+        for_each_vcpu(d, v)
+        {
+            if ( v->arch.shadow2 )
+                shadow2_detach_old_tables(v);
+#if CONFIG_PAGING_LEVELS == 4
+            if ( !(v->arch.flags & TF_kernel_mode) )
+                make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
+            else
+#endif
+                make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
+
+        }
+
+        /* Pull down the memory allocation */
+        if ( set_sh2_allocation(d, 0, NULL) != 0 )
+        {
+            // XXX - How can this occur?
+            //       Seems like a bug to return an error now that we've
+            //       disabled the relevant shadow mode.
+            //
+            return -ENOMEM;
+        }
+        shadow2_hash_teardown(d);
+        SHADOW2_PRINTK("un-shadowing of domain %u done."
+                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->domain_id,
+                       d->arch.shadow2_total_pages, 
+                       d->arch.shadow2_free_pages, 
+                       d->arch.shadow2_p2m_pages);
+    }
+
+    return 0;
+}
+
+/* Enable/disable ops for the "test" and "log-dirty" modes */
+int shadow2_test_enable(struct domain *d)
+{
+    int ret;
+
+    domain_pause(d);
+    shadow2_lock(d);
+
+    if ( shadow2_mode_enabled(d) )
+    {
+        SHADOW2_ERROR("Don't support enabling test mode "
+                      "on already shadowed doms\n");
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = shadow2_one_bit_enable(d, SHM2_enable);
+ out:
+    shadow2_unlock(d);
+    domain_unpause(d);
+
+    return ret;
+}
+
+int shadow2_test_disable(struct domain *d)
+{
+    int ret;
+
+    domain_pause(d);
+    shadow2_lock(d);
+    ret = shadow2_one_bit_disable(d, SHM2_enable);
+    shadow2_unlock(d);
+    domain_unpause(d);
+
+    return ret;
+}
+
+static int
+sh2_alloc_log_dirty_bitmap(struct domain *d)
+{
+    ASSERT(d->arch.shadow_dirty_bitmap == NULL);
+    d->arch.shadow_dirty_bitmap_size =
+        (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
+        ~(BITS_PER_LONG - 1);
+    d->arch.shadow_dirty_bitmap =
+        xmalloc_array(unsigned long,
+                      d->arch.shadow_dirty_bitmap_size / BITS_PER_LONG);
+    if ( d->arch.shadow_dirty_bitmap == NULL )
+    {
+        d->arch.shadow_dirty_bitmap_size = 0;
+        return -ENOMEM;
+    }
+    memset(d->arch.shadow_dirty_bitmap, 0, d->arch.shadow_dirty_bitmap_size/8);
+
+    return 0;
+}
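
The sizing above rounds the guest's maximum PFN up to a whole number of unsigned longs and stores the result in bits (hence the /8 in the memset). A quick stand-alone check of the arithmetic with a made-up max_pfn (illustration only):

    #include <stdio.h>

    int main(void)
    {
        unsigned long bpl     = 8UL * sizeof(unsigned long);  /* BITS_PER_LONG */
        unsigned long max_pfn = 65537;                        /* hypothetical */
        unsigned long bits    = (max_pfn + (bpl - 1)) & ~(bpl - 1);

        printf("%lu pfns -> %lu bits = %lu longs = %lu bytes\n",
               max_pfn, bits, bits / bpl, bits / 8);
        /* 64-bit build: 65537 pfns -> 65600 bits = 1025 longs = 8200 bytes */
        return 0;
    }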
+
+static void
+sh2_free_log_dirty_bitmap(struct domain *d)
+{
+    d->arch.shadow_dirty_bitmap_size = 0;
+    if ( d->arch.shadow_dirty_bitmap )
+    {
+        xfree(d->arch.shadow_dirty_bitmap);
+        d->arch.shadow_dirty_bitmap = NULL;
+    }
+}
+
+static int shadow2_log_dirty_enable(struct domain *d)
+{
+    int ret;
+
+    domain_pause(d);
+    shadow2_lock(d);
+
+    if ( shadow2_mode_log_dirty(d) )
+    {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if ( shadow2_mode_enabled(d) )
+    {
+        SHADOW2_ERROR("Don't (yet) support enabling log-dirty "
+                      "on already shadowed doms\n");
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = sh2_alloc_log_dirty_bitmap(d);
+    if ( ret != 0 )
+    {
+        sh2_free_log_dirty_bitmap(d);
+        goto out;
+    }
+
+    ret = shadow2_one_bit_enable(d, SHM2_log_dirty);
+    if ( ret != 0 )
+        sh2_free_log_dirty_bitmap(d);
+
+ out:
+    shadow2_unlock(d);
+    domain_unpause(d);
+    return ret;
+}
+
+static int shadow2_log_dirty_disable(struct domain *d)
+{
+    int ret;
+
+    domain_pause(d);
+    shadow2_lock(d);
+    ret = shadow2_one_bit_disable(d, SHM2_log_dirty);
+    if ( !shadow2_mode_log_dirty(d) )
+        sh2_free_log_dirty_bitmap(d);
+    shadow2_unlock(d);
+    domain_unpause(d);
+
+    return ret;
+}
+
+/**************************************************************************/
+/* P2M map manipulations */
+
+static void
+sh2_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
+{
+    struct vcpu *v;
+
+    if ( !shadow2_mode_translate(d) )
+        return;
+
+    v = current;
+    if ( v->domain != d )
+        v = d->vcpu[0];
+
+
+    SHADOW2_PRINTK("removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+    ASSERT(mfn_x(sh2_gfn_to_mfn(d, gfn)) == mfn);
+    //ASSERT(sh2_mfn_to_gfn(d, mfn) == gfn);
+
+    shadow2_remove_all_shadows_and_parents(v, _mfn(mfn));
+    if ( shadow2_remove_all_mappings(v, _mfn(mfn)) )
+        flush_tlb_mask(d->domain_dirty_cpumask);
+    shadow2_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
+    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+}
+
+void
+shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
+                                  unsigned long mfn)
+{
+    shadow2_lock(d);
+    shadow2_audit_p2m(d);
+    sh2_p2m_remove_page(d, gfn, mfn);
+    shadow2_audit_p2m(d);
+    shadow2_unlock(d);    
+}
+
+void
+shadow2_guest_physmap_add_page(struct domain *d, unsigned long gfn,
+                               unsigned long mfn)
+{
+    struct vcpu *v;
+    unsigned long ogfn;
+    mfn_t omfn;
+
+    if ( !shadow2_mode_translate(d) )
+        return;
+
+    v = current;
+    if ( v->domain != d )
+        v = d->vcpu[0];
+
+    shadow2_lock(d);
+    shadow2_audit_p2m(d);
+
+    SHADOW2_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+    omfn = sh2_gfn_to_mfn(d, gfn);
+    if ( valid_mfn(omfn) )
+    {
+        /* Get rid of the old mapping, especially any shadows */
+        shadow2_remove_all_shadows_and_parents(v, omfn);
+        if ( shadow2_remove_all_mappings(v, omfn) )
+            flush_tlb_mask(d->domain_dirty_cpumask);
+        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+    }        
+
+    ogfn = sh2_mfn_to_gfn(d, _mfn(mfn));
+    if (
+#ifdef __x86_64__
+        (ogfn != 0x5555555555555555L)
+#else
+        (ogfn != 0x55555555L)
+#endif
+        && (ogfn != INVALID_M2P_ENTRY)
+        && (ogfn != gfn) )
+    {
+        /* This machine frame is already mapped at another physical address */
+        SHADOW2_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
+                       mfn, ogfn, gfn);
+        if ( valid_mfn(omfn = sh2_gfn_to_mfn(d, ogfn)) ) 
+        {
+            SHADOW2_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n", 
+                           ogfn , mfn_x(omfn));
+            if ( mfn_x(omfn) == mfn ) 
+                sh2_p2m_remove_page(d, ogfn, mfn);
+        }
+    }
+
+    shadow2_set_p2m_entry(d, gfn, _mfn(mfn));
+    set_gpfn_from_mfn(mfn, gfn);
+    shadow2_audit_p2m(d);
+    shadow2_unlock(d);
+}
+
+/**************************************************************************/
+/* Log-dirty mode support */
+
+/* Convert a shadow to log-dirty mode. */
+void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
+{
+    BUG();
+}
+
+
+/* Read a domain's log-dirty bitmap and stats.  
+ * If the operation is a CLEAN, clear the bitmap and stats as well. */
+static int shadow2_log_dirty_op(struct domain *d, dom0_shadow_control_t *sc)
+{    
+    int i, rv = 0, clean = 0;
+
+    domain_pause(d);
+    shadow2_lock(d);
+
+    if ( sc->op == DOM0_SHADOW_CONTROL_OP_CLEAN
+         || sc->op == DOM0_SHADOW_CONTROL_OP_FLUSH ) 
+        clean = 1;
+    else 
+        ASSERT(sc->op == DOM0_SHADOW_CONTROL_OP_PEEK); 
+
+    SHADOW2_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n", 
+                  (clean) ? "clean" : "peek",
+                  d->domain_id,
+                  d->arch.shadow_fault_count, 
+                  d->arch.shadow_dirty_count);
+
+    sc->stats.fault_count = d->arch.shadow_fault_count;
+    sc->stats.dirty_count = d->arch.shadow_dirty_count;    
+        
+    if ( clean ) 
+    {
+        struct list_head *l, *t;
+        struct page_info *pg;
+
+        /* Need to revoke write access to the domain's pages again. 
+         * In future, we'll have a less heavy-handed approach to this, 
+         * but for now, we just unshadow everything except Xen. */
+        list_for_each_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+        {
+            pg = list_entry(l, struct page_info, list);
+            shadow2_unhook_mappings(d->vcpu[0], page_to_mfn(pg));
+        }
+
+        d->arch.shadow_fault_count = 0;
+        d->arch.shadow_dirty_count = 0;
+    }
+
+    if ( guest_handle_is_null(sc->dirty_bitmap) ||
+         (d->arch.shadow_dirty_bitmap == NULL) )
+    {
+        rv = -EINVAL;
+        goto out;
+    }
+ 
+    if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
+        sc->pages = d->arch.shadow_dirty_bitmap_size; 
+
+#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
+    for ( i = 0; i < sc->pages; i += CHUNK )
+    {
+        int bytes = ((((sc->pages - i) > CHUNK) 
+                      ? CHUNK 
+                      : (sc->pages - i)) + 7) / 8;
+     
+        if ( copy_to_guest_offset(
+                 sc->dirty_bitmap, 
+                 i/(8*sizeof(unsigned long)),
+                 d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                 (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
+        {
+            rv = -EINVAL;
+            goto out;
+        }
+
+        if ( clean )
+            memset(d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                   0, bytes);
+    }
+#undef CHUNK
+
+ out:
+    shadow2_unlock(d);
+    domain_unpause(d);
+    return rv;
+}
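
The copy loop above moves the bitmap to the guest in CHUNK-bit (8192-bit, i.e. 1kB) pieces: bytes is the byte count of the current, possibly short, final chunk, and both the guest-handle offset and the copy length are then expressed in unsigned longs. A stand-alone replay of that arithmetic for a made-up 10,000-bit bitmap on a 64-bit host (illustration only):

    #include <stdio.h>

    int main(void)
    {
        unsigned int pages = 10000;        /* hypothetical sc->pages */
        unsigned int chunk = 8 * 1024;     /* CHUNK, in bits */
        unsigned int i;

        for ( i = 0; i < pages; i += chunk )
        {
            unsigned int bits  = (pages - i > chunk) ? chunk : (pages - i);
            unsigned int bytes = (bits + 7) / 8;
            unsigned int longs = (bytes + sizeof(unsigned long) - 1)
                                  / sizeof(unsigned long);
            printf("bit %5u: %4u bits -> %4u bytes -> %3u longs, long offset %lu\n",
                   i, bits, bytes, longs,
                   (unsigned long)(i / (8 * sizeof(unsigned long))));
        }
        /* Prints 8192 bits/1024 bytes/128 longs at offset 0, then
         * 1808 bits/226 bytes/29 longs at offset 128. */
        return 0;
    }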
+
+
+/* Mark a page as dirty */
+void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn)
+{
+    unsigned long pfn;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(shadow2_mode_log_dirty(d));
+
+    if ( !valid_mfn(gmfn) )
+        return;
+
+    ASSERT(d->arch.shadow_dirty_bitmap != NULL);
+
+    /* We /really/ mean PFN here, even for non-translated guests. */
+    pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+    /*
+     * Values with the MSB set denote MFNs that aren't really part of the 
+     * domain's pseudo-physical memory map (e.g., the shared info frame).
+     * Nothing to do here...
+     */
+    if ( unlikely(!VALID_M2P(pfn)) )
+        return;
+
+    /* N.B. Can use non-atomic TAS because protected by shadow2_lock. */
+    if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) ) 
+    { 
+        if ( !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) )
+        {
+            SHADOW2_DEBUG(LOGDIRTY, 
+                          "marked mfn %" SH2_PRI_mfn " (pfn=%lx), dom %d\n",
+                          mfn_x(gmfn), pfn, d->domain_id);
+            d->arch.shadow_dirty_count++;
+        }
+    }
+    else
+    {
+        SHADOW2_PRINTK("mark_dirty OOR! "
+                       "mfn=%" SH2_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
+                       "owner=%d c=%08x t=%" PRtype_info "\n",
+                       mfn_x(gmfn), 
+                       pfn, 
+                       d->arch.shadow_dirty_bitmap_size,
+                       d->domain_id,
+                       (page_get_owner(mfn_to_page(gmfn))
+                        ? page_get_owner(mfn_to_page(gmfn))->domain_id
+                        : -1),
+                       mfn_to_page(gmfn)->count_info, 
+                       mfn_to_page(gmfn)->u.inuse.type_info);
+    }
+}
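
The "MSB set" convention mentioned in the comment above can be sketched like this (illustrative EX_* macro, not the real VALID_M2P definition): an m2p entry with its top bit set stands for "no pseudo-physical address", so mark_dirty silently ignores such frames.

    #include <stdio.h>

    #define EX_M2P_MSB        (1UL << (8 * sizeof(unsigned long) - 1))
    #define EX_VALID_M2P(pfn) (!((pfn) & EX_M2P_MSB))

    int main(void)
    {
        unsigned long normal = 0x1234;             /* ordinary PFN */
        unsigned long marker = EX_M2P_MSB | 0x7;   /* e.g. shared-info style */
        printf("%#lx valid=%d, %#lx valid=%d\n",
               normal, EX_VALID_M2P(normal), marker, EX_VALID_M2P(marker));
        return 0;
    }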
+
+
+/**************************************************************************/
+/* Shadow-control DOM0_OP dispatcher */
+
+int shadow2_control_op(struct domain *d, 
+                       dom0_shadow_control_t *sc,
+                       XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op)
+{
+    int rc, preempted = 0;
+
+    if ( unlikely(d == current->domain) )
+    {
+        DPRINTK("Don't try to do a shadow op on yourself!\n");
+        return -EINVAL;
+    }
+
+    switch ( sc->op )
+    {
+    case DOM0_SHADOW_CONTROL_OP_OFF:
+        if ( shadow2_mode_log_dirty(d) )
+            if ( (rc = shadow2_log_dirty_disable(d)) != 0 ) 
+                return rc;
+        if ( d->arch.shadow2_mode & SHM2_enable )
+            if ( (rc = shadow2_test_disable(d)) != 0 ) 
+                return rc;
+        return 0;
+
+    case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
+        return shadow2_test_enable(d);
+        
+    case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
+        return shadow2_log_dirty_enable(d);
+        
+    case DOM0_SHADOW_CONTROL_OP_FLUSH:
+    case DOM0_SHADOW_CONTROL_OP_CLEAN:
+    case DOM0_SHADOW_CONTROL_OP_PEEK:
+        return shadow2_log_dirty_op(d, sc);
+
+    case DOM0_SHADOW2_CONTROL_OP_ENABLE:
+        return shadow2_enable(d, sc->mode << SHM2_shift);        
+
+    case DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION:
+        sc->mb = shadow2_get_allocation(d);
+        return 0;
+        
+    case DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION:
+        rc = shadow2_set_allocation(d, sc->mb, &preempted);
+        if ( preempted )
+            /* Not finished.  Set up to re-run the call. */
+            rc = hypercall_create_continuation(
+                __HYPERVISOR_dom0_op, "h", u_dom0_op);
+        else 
+            /* Finished.  Return the new allocation */
+            sc->mb = shadow2_get_allocation(d);
+        return rc;
+
+    default:
+        SHADOW2_ERROR("Bad shadow op %u\n", sc->op);
+        return -EINVAL;
+    }
+}
+
+
+/**************************************************************************/
+/* Auditing shadow tables */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL
+
+void shadow2_audit_tables(struct vcpu *v) 
+{
+    /* Dispatch table for getting per-type functions */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,2,2),  /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,2,2), /* fl1_32  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,2,2),  /* l2_32   */
+#else 
+        SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,2),  /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,2), /* fl1_32  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,2),  /* l2_32   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,3),  /* l1_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,3), /* fl1_pae */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3),  /* l2_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3),  /* l2h_pae */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,3,3),  /* l3_pae  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,4,4),  /* l1_64   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,4,4), /* fl1_64  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,4,4),  /* l2_64   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,4,4),  /* l3_64   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l4_table,4,4),  /* l4_64   */
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS > 2 */
+        NULL  /* All the rest */
+    };
+    unsigned int mask; 
+
+    if ( !(SHADOW2_AUDIT_ENABLE) )
+        return;
+    
+    if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL )
+        mask = ~1; /* Audit every table in the system */
+    else 
+    {
+        /* Audit only the current mode's tables */
+        switch (v->arch.shadow2->guest_levels)
+        {
+        case 2: mask = (SH2F_L1_32|SH2F_FL1_32|SH2F_L2_32); break;
+        case 3: mask = (SH2F_L1_PAE|SH2F_FL1_PAE|SH2F_L2_PAE
+                        |SH2F_L2H_PAE|SH2F_L3_PAE); break;
+        case 4: mask = (SH2F_L1_64|SH2F_FL1_64|SH2F_L2_64  
+                        |SH2F_L3_64|SH2F_L4_64); break;
+        default: BUG();
+        }
+    }
+
+    hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
+}
+
+#endif /* Shadow audit */
+
+
+/**************************************************************************/
+/* Auditing p2m tables */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M
+
+void shadow2_audit_p2m(struct domain *d)
+{
+    struct list_head *entry;
+    struct page_info *page;
+    struct domain *od;
+    unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
+    mfn_t p2mfn;
+    unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
+    int test_linear;
+    
+    if ( !(SHADOW2_AUDIT_ENABLE) || !shadow2_mode_translate(d) )
+        return;
+
+    //SHADOW2_PRINTK("p2m audit starts\n");
+
+    test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
+    if ( test_linear )
+        local_flush_tlb(); 
+
+    /* Audit part one: walk the domain's page allocation list, checking 
+     * the m2p entries. */
+    for ( entry = d->page_list.next;
+          entry != &d->page_list;
+          entry = entry->next )
+    {
+        page = list_entry(entry, struct page_info, list);
+        mfn = mfn_x(page_to_mfn(page));
+
+        // SHADOW2_PRINTK("auditing guest page, mfn=%#lx\n", mfn); 
+
+        od = page_get_owner(page);
+
+        if ( od != d ) 
+        {
+            SHADOW2_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
+                           mfn, od, (od?od->domain_id:-1), d, d->domain_id);
+            continue;
+        }
+
+        gfn = get_gpfn_from_mfn(mfn);
+        if ( gfn == INVALID_M2P_ENTRY ) 
+        {
+            orphans_i++;
+            //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
+            //               mfn); 
+            continue;
+        }
+
+        if ( gfn == 0x55555555 ) 
+        {
+            orphans_d++;
+            //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n", 
+            //               mfn); 
+            continue;
+        }
+
+        p2mfn = sh2_gfn_to_mfn_foreign(d, gfn);
+        if ( mfn_x(p2mfn) != mfn )
+        {
+            mpbad++;
+            SHADOW2_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
+                           " (-> gfn %#lx)\n",
+                           mfn, gfn, mfn_x(p2mfn),
+                           (mfn_valid(p2mfn)
+                            ? get_gpfn_from_mfn(mfn_x(p2mfn))
+                            : -1u));
+            /* This m2p entry is stale: the domain has another frame in
+             * this physical slot.  No great disaster, but for neatness,
+             * blow away the m2p entry. */ 
+            set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+        }
+
+        if ( test_linear )
+        {
+            lp2mfn = get_mfn_from_gpfn(gfn);
+            if ( lp2mfn != mfn_x(p2mfn) )
+            {
+                SHADOW2_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
+                               "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn));
+            }
+        }
+
+        // SHADOW2_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n", 
+        //                mfn, gfn, p2mfn, lp2mfn); 
+    }   
+
+    /* Audit part two: walk the domain's p2m table, checking the entries. */
+    if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
+    {
+        l2_pgentry_t *l2e;
+        l1_pgentry_t *l1e;
+        int i1, i2;
+        
+#if CONFIG_PAGING_LEVELS == 4
+        l4_pgentry_t *l4e;
+        l3_pgentry_t *l3e;
+        int i3, i4;
+        l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#elif CONFIG_PAGING_LEVELS == 3
+        l3_pgentry_t *l3e;
+        int i3;
+        l3e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#else /* CONFIG_PAGING_LEVELS == 2 */
+        l2e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#endif
+
+        gfn = 0;
+#if CONFIG_PAGING_LEVELS >= 3
+#if CONFIG_PAGING_LEVELS >= 4
+        for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+        {
+            if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+            {
+                gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
+                continue;
+            }
+            l3e = sh2_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
+#endif /* now at levels 3 or 4... */
+            for ( i3 = 0; 
+                  i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8); 
+                  i3++ )
+            {
+                if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+                {
+                    gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+                    continue;
+                }
+                l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
+#endif /* all levels... */
+                for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+                {
+                    if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+                    {
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+                    l1e = sh2_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
+                    
+                    for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+                    {
+                        if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+                            continue;
+                        mfn = l1e_get_pfn(l1e[i1]);
+                        ASSERT(valid_mfn(_mfn(mfn)));
+                        m2pfn = get_gpfn_from_mfn(mfn);
+                        if ( m2pfn != gfn )
+                        {
+                            pmbad++;
+                            SHADOW2_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                           " -> gfn %#lx\n", gfn, mfn, m2pfn);
+                            BUG();
+                        }
+                    }
+                    sh2_unmap_domain_page(l1e);
+                }
+#if CONFIG_PAGING_LEVELS >= 3
+                sh2_unmap_domain_page(l2e);
+            }
+#if CONFIG_PAGING_LEVELS >= 4
+            sh2_unmap_domain_page(l3e);
+        }
+#endif
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+        sh2_unmap_domain_page(l4e);
+#elif CONFIG_PAGING_LEVELS == 3
+        sh2_unmap_domain_page(l3e);
+#else /* CONFIG_PAGING_LEVELS == 2 */
+        sh2_unmap_domain_page(l2e);
+#endif
+
+    }
+
+    //SHADOW2_PRINTK("p2m audit complete\n");
+    //if ( orphans_i | orphans_d | mpbad | pmbad ) 
+    //    SHADOW2_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
+    //                   orphans_i + orphans_d, orphans_i, orphans_d,
+    if ( mpbad | pmbad ) 
+        SHADOW2_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
+                       pmbad, mpbad);
+}
+
+#endif /* p2m audit */
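
/* Illustrative sketch only, not part of this changeset: the consistency
 * condition the audit above checks, written out in one place.  It reuses
 * the helpers the audit code calls; the function name itself is a
 * hypothetical stand-in. */
#if 0
static int p2m_m2p_consistent(struct domain *d, unsigned long mfn)
{
    unsigned long gfn = get_gpfn_from_mfn(mfn);              /* m2p lookup */
    return ( gfn != INVALID_M2P_ENTRY &&
             mfn_x(sh2_gfn_to_mfn_foreign(d, gfn)) == mfn ); /* p2m maps back */
}
#endif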
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End: 
+ */
diff -r fda70200da01 -r 0f917d63e960 xen/arch/x86/shadow2.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/shadow2.c    Wed Aug 16 17:02:35 2006 +0100
@@ -0,0 +1,4469 @@
+/******************************************************************************
+ * arch/x86/shadow2.c
+ *
+ * Simple, mostly-synchronous shadow page tables. 
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+// DESIGN QUESTIONS:
+// Why use subshadows for PAE guests?
+// - reduces pressure in the hash table
+// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3)
+// - would need to find space in the page_info to store 7 more bits of
+//   backpointer
+// - independent shadows of 32 byte chunks makes it non-obvious how to quickly
+//   figure out when to demote the guest page from l3 status
+//
+// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space.
+// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address
+//   space for both PV and HVM guests: at 8 bytes per 4kB page, the P2M for
+//   an 8GB guest is exactly 16MB.
+//
+
+#define SHADOW2 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/domain_page.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/shadow2.h>
+#include <asm/shadow2-private.h>
+#include <asm/shadow2-types.h>
+#include <asm/flushtlb.h>
+#include <asm/hvm/hvm.h>
+
+/* The first cut: an absolutely synchronous, trap-and-emulate version,
+ * supporting only HVM guests (and so only "external" shadow mode). 
+ *
+ * THINGS TO DO LATER:
+ * 
+ * FIX GVA_TO_GPA
+ * The current interface returns an unsigned long, which is not big enough
+ * to hold a physical address in PAE.  Should return a gfn instead.
+ * 
+ * TEARDOWN HEURISTICS
+ * Also: have a heuristic for when to destroy a previous paging-mode's 
+ * shadows.  When a guest is done with its start-of-day 32-bit tables
+ * and reuses the memory, we want to drop those shadows.  Treat a page
+ * that has shadows in two different modes as a hint that this has
+ * happened, but beware of clever tricks like reusing a pagetable for
+ * both PAE and 64-bit during boot...
+ *
+ * PAE LINEAR MAPS
+ * Rework shadow_get_l*e() to have the option of using map_domain_page()
+ * instead of linear maps.  Add appropriate unmap_l*e calls in the users. 
+ * Then we can test the speed difference made by linear maps.  If the 
+ * map_domain_page() version is OK on PAE, we could maybe allow a lightweight 
+ * l3-and-l2h-only shadow mode for PAE PV guests that would allow them 
+ * to share l2h pages again. 
+ *
+ * PAE L3 COPYING
+ * In this code, we copy all 32 bytes of a PAE L3 every time we change an 
+ * entry in it, and every time we change CR3.  We copy it for the linear 
+ * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
+ * buffer so it fits in CR3.  Maybe we can avoid some of this recopying 
+ * by using the shadow directly in some places. 
+ * Also, for SMP, need to actually respond to seeing shadow2_pae_flip_pending.
+ *
+ * GUEST_WALK_TABLES TLB FLUSH COALESCE
+ * guest_walk_tables can do up to three remote TLB flushes as it walks to
+ * the first l1 of a new pagetable.  Should coalesce the flushes to the end, 
+ * and if we do flush, re-do the walk.  If anything has changed, then 
+ * pause all the other vcpus and do the walk *again*.
+ *
+ * WP DISABLED
+ * Consider how to implement having the WP bit of CR0 set to 0.  
+ * Since we need to be able to cause write faults to pagetables, this might
+ * end up looking like not having the (guest) pagetables present at all in 
+ * HVM guests...
+ *
+ * PSE disabled / PSE36
+ * We don't support any modes other than PSE enabled, PSE36 disabled.
+ * Neither of those would be hard to change, but we'd need to be able to 
+ * deal with shadows made in one mode and used in another.
+ */
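
/* Illustrative sketch only, not part of this changeset: the "PAE L3 COPYING"
 * item above boils down to recopying the four 64-bit L3 entries (32 bytes)
 * whenever one of them changes or CR3 is reloaded.  The function and buffer
 * names here are hypothetical stand-ins. */
#if 0
static void pae_recopy_l3(const l3_pgentry_t *shadow_l3, l3_pgentry_t *dest_l3)
{
    int i;
    for ( i = 0; i < 4; i++ )       /* 4 entries x 8 bytes = 32 bytes */
        dest_l3[i] = shadow_l3[i];
}
#endif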
+
+#define FETCH_TYPE_PREFETCH 1
+#define FETCH_TYPE_DEMAND   2
+#define FETCH_TYPE_WRITE    4
+typedef enum {
+    ft_prefetch     = FETCH_TYPE_PREFETCH,
+    ft_demand_read  = FETCH_TYPE_DEMAND,
+    ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
+} fetch_type_t;
+
+#ifndef NDEBUG
+static char *fetch_type_names[] = {
+    [ft_prefetch]     = "prefetch",
+    [ft_demand_read]  = "demand read",
+    [ft_demand_write] = "demand write",
+};
+#endif
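
/* Illustrative only, not part of this changeset: how the flag encoding above
 * is meant to be tested.  FETCH_TYPE_DEMAND is set for both demand reads and
 * demand writes, so one bit-test separates prefetches from demand fetches,
 * and FETCH_TYPE_WRITE picks out writes.  The helper name is hypothetical. */
#if 0
static inline int fetch_is_demand_write(fetch_type_t ft)
{
    return (ft & FETCH_TYPE_DEMAND) && (ft & FETCH_TYPE_WRITE);
}
#endif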
+
+/* XXX forward declarations */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab,
+                                       int clear_res);
+#endif
+static inline void sh2_update_linear_entries(struct vcpu *v);
+
+/**************************************************************************/
+/* Hash table mapping from guest pagetables to shadows
+ *
+ * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
+ * FL1's:       maps the *gfn* of the start of a superpage to the mfn of a
+ *              shadow L1 which maps its "splinters".
+ * PAE CR3s:    maps the 32-byte aligned, 32-bit CR3 value to the mfn of the
+ *              PAE L3 info page for that CR3 value.
+ */
+
+static inline mfn_t 
+get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
+/* Look for FL1 shadows in the hash table */
+{
+    mfn_t smfn = shadow2_hash_lookup(v, gfn_x(gfn),
+                                     PGC_SH2_fl1_shadow >> PGC_SH2_type_shift);
+
+    if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+    {
+        struct page_info *page = mfn_to_page(smfn);
+        if ( !(page->count_info & PGC_SH2_log_dirty) )
+            shadow2_convert_to_log_dirty(v, smfn);
+    }
+
+    return smfn;
+}
+
+static inline mfn_t 
+get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+/* Look for shadows in the hash table */
+{
+    mfn_t smfn = shadow2_hash_lookup(v, mfn_x(gmfn),
+                                     shadow_type >> PGC_SH2_type_shift);
+    perfc_incrc(shadow2_get_shadow_status);
+
+    if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+    {
+        struct page_info *page = mfn_to_page(smfn);
+        if ( !(page->count_info & PGC_SH2_log_dirty) )
+            shadow2_convert_to_log_dirty(v, smfn);
+    }
+
+    return smfn;
+}
+
+static inline void 
+set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+/* Put an FL1 shadow into the hash table */
+{
+    SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n",
+                   gfn_x(gfn), PGC_SH2_fl1_shadow, mfn_x(smfn));
+
+    if ( unlikely(shadow2_mode_log_dirty(v->domain)) )

