[Xen-changelog] Check in files I missed from shadow64 checkin.

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] Check in files I missed from shadow64 checkin.
From: Xen patchbot -unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Mon, 11 Jul 2005 05:58:10 -0400
Delivery-date: Mon, 11 Jul 2005 09:58:33 +0000
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 0bcfd66a431ebfc70fc068a134e684568ac02966
# Parent  d332d4df452ecf6c3aaeab73c79e1e6ce751b61d

Check in files I missed from shadow64 checkin.

diff -r d332d4df452e -r 0bcfd66a431e xen/arch/x86/shadow_public.c
--- /dev/null   Mon Jul 11 09:22:15 2005
+++ b/xen/arch/x86/shadow_public.c      Mon Jul 11 09:57:38 2005
@@ -0,0 +1,1654 @@
+/******************************************************************************
+ * arch/x86/shadow_public.c
+ * 
+ * Copyright (c) 2005 Michael A Fetterman
+ * Based on an earlier implementation by Ian Pratt et al
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/domain_page.h>
+#include <asm/shadow.h>
+#include <asm/page.h>
+#include <xen/event.h>
+#include <xen/sched.h>
+#include <xen/trace.h>
+
+#if CONFIG_PAGING_LEVELS >= 4 
+#include <asm/shadow_64.h>
+
+extern struct shadow_ops MODE_F_HANDLER;
+#endif
+
+extern struct shadow_ops MODE_A_HANDLER;
+
+/****************************************************************************/
+/************* export interface functions ***********************************/
+/****************************************************************************/
+
+
+int shadow_set_guest_paging_levels(struct domain *d, int levels)
+{
+    shadow_lock(d);
+
+    switch(levels) {
+#if CONFIG_PAGING_LEVELS >= 4
+    case 4:
+        if ( d->arch.ops != &MODE_F_HANDLER )
+            d->arch.ops = &MODE_F_HANDLER;
+        shadow_unlock(d);
+        return 1;
+#endif
+    case 3:
+    case 2:
+        if ( d->arch.ops != &MODE_A_HANDLER )
+            d->arch.ops = &MODE_A_HANDLER;
+        shadow_unlock(d);
+        return 1;
+    default:
+        shadow_unlock(d);
+        return 0;
+    }
+}
+
+void shadow_invlpg(struct vcpu *v, unsigned long va)
+{
+    struct domain *d = current->domain;
+    d->arch.ops->invlpg(v, va);
+}
+
+int shadow_fault(unsigned long va, struct cpu_user_regs *regs)
+{
+    struct domain *d = current->domain;
+    return d->arch.ops->fault(va, regs);
+}
+
+void __update_pagetables(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    d->arch.ops->update_pagetables(v);
+}
+
+void __shadow_sync_all(struct domain *d)
+{
+    d->arch.ops->sync_all(d);
+}
+    
+int shadow_remove_all_write_access(
+    struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
+{
+    return d->arch.ops->remove_all_write_access(d, readonly_gpfn, readonly_gmfn);
+}
+
+int shadow_do_update_va_mapping(unsigned long va,
+                                l1_pgentry_t val,
+                                struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    return d->arch.ops->do_update_va_mapping(va, val, v);
+}
+
+struct out_of_sync_entry *
+shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
+                             unsigned long mfn)
+{
+   struct domain *d = v->domain;
+   return d->arch.ops->mark_mfn_out_of_sync(v, gpfn, mfn);
+}
+
+/*
+ * Returns 1 if va's shadow mapping is out-of-sync.
+ * Returns 0 otherwise.
+ */
+int __shadow_out_of_sync(struct vcpu *v, unsigned long va)
+{
+    struct domain *d = v->domain;
+    return d->arch.ops->is_out_of_sync(v, va);
+}
+
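Every exported entry point above is a thin wrapper that dispatches through
the per-domain ops table selected by shadow_set_guest_paging_levels()
(MODE_A_HANDLER for 2- and 3-level guests, MODE_F_HANDLER for 4-level
guests). A minimal, self-contained sketch of that idiom, with hypothetical
names standing in for struct shadow_ops and the two handlers:

    struct ops {
        int (*fault)(unsigned long va);         /* one slot per operation */
    };

    static int fault_2level(unsigned long va) { return 0; /* MODE_A path */ }
    static int fault_4level(unsigned long va) { return 0; /* MODE_F path */ }

    static const struct ops mode_a = { .fault = fault_2level };
    static const struct ops mode_f = { .fault = fault_4level };

    struct dom { const struct ops *ops; };      /* cf. d->arch.ops */

    static int dispatch_fault(struct dom *d, unsigned long va)
    {
        return d->ops->fault(va);   /* every wrapper is one indirect call */
    }
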
+/****************************************************************************/
+/****************************************************************************/
+#if CONFIG_PAGING_LEVELS >= 4
+/*
+ * Convert PAE 3-level page-table to 4-level page-table
+ */
+#define PDP_ENTRIES   4
+static pagetable_t page_table_convert(struct domain *d)
+{
+    struct pfn_info *l4page, *l3page;
+    l4_pgentry_t *l4;
+    l3_pgentry_t *l3, *pae_l3;
+    int i;
+    
+    l4page = alloc_domheap_page(NULL);
+    if (l4page == NULL)
+        domain_crash();
+    l4 = map_domain_page(page_to_pfn(l4page));
+    memset(l4, 0, PAGE_SIZE);
+
+    l3page = alloc_domheap_page(NULL);
+    if (l3page == NULL)
+        domain_crash();
+    l3 =  map_domain_page(page_to_pfn(l3page));
+    memset(l3, 0, PAGE_SIZE);
+
+    l4[0] = l4e_from_page(l3page, __PAGE_HYPERVISOR);
+    pae_l3 = map_domain_page(pagetable_get_pfn(d->arch.phys_table));
+
+    for (i = 0; i < PDP_ENTRIES; i++) {
+        l3[i] = pae_l3[i];
+        l3e_add_flags(l3[i], 0x67);
+    }
+
+    unmap_domain_page(l4);
+    unmap_domain_page(l3);
+
+    return mk_pagetable(page_to_phys(l4page));
+}
+
+void alloc_monitor_pagetable(struct vcpu *v)
+{
+    unsigned long mmfn;
+    l4_pgentry_t *mpl4e;
+    struct pfn_info *mmfn_info;
+    struct domain *d = v->domain;
+     pagetable_t phys_table;
+
+    ASSERT(!pagetable_get_paddr(v->arch.monitor_table)); /* we should only get called once */
+
+    mmfn_info = alloc_domheap_page(NULL);
+    ASSERT( mmfn_info );
+
+    mmfn = (unsigned long) (mmfn_info - frame_table);
+    mpl4e = (l4_pgentry_t *) map_domain_page(mmfn);
+    memcpy(mpl4e, &idle_pg_table[0], PAGE_SIZE);
+    mpl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
+      l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
+    /* map the phys_to_machine map into the per domain Read-Only MPT space */
+    phys_table = page_table_convert(d);
+
+    mpl4e[l4_table_offset(RO_MPT_VIRT_START)] =
+       l4e_from_paddr(pagetable_get_paddr(phys_table),
+         __PAGE_HYPERVISOR);
+    v->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
+    v->arch.monitor_vtable = (l2_pgentry_t *) mpl4e;
+}
+
+static void inline
+free_shadow_fl1_table(struct domain *d, unsigned long smfn)
+{
+    l1_pgentry_t *pl1e = map_domain_page(smfn);
+    int i;
+
+    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
+        put_page_from_l1e(pl1e[i], d);
+
+    unmap_domain_page(pl1e);
+}
+
+/*
+ * Free l2, l3, l4 shadow tables
+ */
+static void inline
+free_shadow_tables(struct domain *d, unsigned long smfn, u32 level)
+{
+    pgentry_64_t *ple = map_domain_page(smfn);
+    int i, external = shadow_mode_external(d);
+
+    for ( i = 0; i < PAGETABLE_ENTRIES; i++ )
+        if ( external || is_guest_l4_slot(i) )
+            if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
+                put_shadow_ref(entry_get_pfn(ple[i]));
+
+    unmap_domain_page(ple);
+}
+
+void free_monitor_pagetable(struct vcpu *v)
+{
+    unsigned long mfn;
+
+//    ASSERT( pagetable_val(v->arch.monitor_table) );
+    /*
+     * free monitor_table.
+     */
+    //mfn = (pagetable_val(v->arch.monitor_table)) >> PAGE_SHIFT;
+    mfn = pagetable_get_pfn(v->arch.monitor_table);
+    unmap_domain_page(v->arch.monitor_vtable);
+    free_domheap_page(&frame_table[mfn]);
+    v->arch.monitor_table = mk_pagetable(0);
+    v->arch.monitor_vtable = 0;
+}
+
+#elif CONFIG_PAGING_LEVELS == 2
+static void alloc_monitor_pagetable(struct vcpu *v)
+{
+    unsigned long mmfn;
+    l2_pgentry_t *mpl2e;
+    struct pfn_info *mmfn_info;
+    struct domain *d = v->domain;
+
+    ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0);
+
+    mmfn_info = alloc_domheap_page(NULL);
+    ASSERT(mmfn_info != NULL);
+
+    mmfn = page_to_pfn(mmfn_info);
+    mpl2e = (l2_pgentry_t *)map_domain_page(mmfn);
+    memset(mpl2e, 0, PAGE_SIZE);
+
+#ifdef __i386__ /* XXX screws x86/64 build */
+    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
+           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+#endif
+
+    mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
+        l2e_from_paddr(__pa(d->arch.mm_perdomain_pt),
+                        __PAGE_HYPERVISOR);
+
+    // map the phys_to_machine map into the Read-Only MPT space for this domain
+    mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
+        l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
+                        __PAGE_HYPERVISOR);
+
+    // Don't (yet) have mappings for these...
+    // Don't want to accidentally see the idle_pg_table's linear mapping.
+    //
+    mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
+    mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
+
+    v->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
+    v->arch.monitor_vtable = mpl2e;
+}
+
+/*
+ * Free the pages for monitor_table and hl2_table
+ */
+void free_monitor_pagetable(struct vcpu *v)
+{
+    l2_pgentry_t *mpl2e, hl2e, sl2e;
+    unsigned long mfn;
+
+    ASSERT( pagetable_get_paddr(v->arch.monitor_table) );
+    
+    mpl2e = v->arch.monitor_vtable;
+
+    /*
+     * First get the mfn for hl2_table by looking at monitor_table
+     */
+    hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
+    if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
+    {
+        mfn = l2e_get_pfn(hl2e);
+        ASSERT(mfn);
+        put_shadow_ref(mfn);
+    }
+
+    sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
+    if ( l2e_get_flags(sl2e) & _PAGE_PRESENT )
+    {
+        mfn = l2e_get_pfn(sl2e);
+        ASSERT(mfn);
+        put_shadow_ref(mfn);
+    }
+
+    unmap_domain_page(mpl2e);
+
+    /*
+     * Then free monitor_table.
+     */
+    mfn = pagetable_get_pfn(v->arch.monitor_table);
+    free_domheap_page(&frame_table[mfn]);
+
+    v->arch.monitor_table = mk_pagetable(0);
+    v->arch.monitor_vtable = 0;
+}
+#endif 
+
+static void
+shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
+{
+    void *snapshot;
+
+    if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
+        return;
+
+    // Clear the out_of_sync bit.
+    //
+    clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info);
+
+    // XXX Need to think about how to protect the domain's
+    // information less expensively.
+    //
+    snapshot = map_domain_page(entry->snapshot_mfn);
+    memset(snapshot, 0, PAGE_SIZE);
+    unmap_domain_page(snapshot);
+
+    put_shadow_ref(entry->snapshot_mfn);
+}
+
+void
+release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
+{
+    struct pfn_info *page;
+
+    page = &frame_table[entry->gmfn];
+        
+    // Decrement ref count of guest & shadow pages
+    //
+    put_page(page);
+
+    // Only use entries that have low bits clear...
+    //
+    if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
+    {
+        put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
+        entry->writable_pl1e = -2;
+    }
+    else
+        ASSERT( entry->writable_pl1e == -1 );
+
+    // Free the snapshot
+    //
+    shadow_free_snapshot(d, entry);
+}
+
+static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
+{
+    struct out_of_sync_entry *entry = d->arch.out_of_sync;
+    struct out_of_sync_entry **prev = &d->arch.out_of_sync;
+    struct out_of_sync_entry *found = NULL;
+
+    // NB: Be careful not to call something that manipulates this list
+    //     while walking it.  Collect the results into a separate list
+    //     first, then walk that list.
+    //
+    while ( entry )
+    {
+        if ( entry->gmfn == gmfn )
+        {
+            // remove from out of sync list
+            *prev = entry->next;
+
+            // add to found list
+            entry->next = found;
+            found = entry;
+
+            entry = *prev;
+            continue;
+        }
+        prev = &entry->next;
+        entry = entry->next;
+    }
+
+    prev = NULL;
+    entry = found;
+    while ( entry )
+    {
+        release_out_of_sync_entry(d, entry);
+
+        prev = &entry->next;
+        entry = entry->next;
+    }
+
+    // Add found list to free list
+    if ( prev )
+    {
+        *prev = d->arch.out_of_sync_free;
+        d->arch.out_of_sync_free = found;
+    }
+}
+
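remove_out_of_sync_entries() above uses a collect-then-process walk:
matching entries are first unlinked onto a private "found" list, and only
that private list is processed, because (per the comment) the processing
step can manipulate the live list mid-walk. A self-contained sketch of the
same idiom on a generic singly linked list (names are illustrative):

    struct node { int key; struct node *next; };

    /* Unlink every node with the given key; return them as a private list. */
    static struct node *collect(struct node **head, int key)
    {
        struct node **prev = head, *found = NULL;

        while ( *prev )
        {
            struct node *n = *prev;
            if ( n->key == key )
            {
                *prev = n->next;    /* unlink from the live list... */
                n->next = found;    /* ...and push onto the private list */
                found = n;
            }
            else
                prev = &n->next;
        }
        return found;               /* safe to process without re-walking */
    }
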
+static inline void
+shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
+{
+    if ( !shadow_mode_refcounts(d) )
+        return;
+
+    ASSERT(frame_table[gmfn].count_info & PGC_page_table);
+
+    if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
+    {
+        clear_bit(_PGC_page_table, &frame_table[gmfn].count_info);
+
+        if ( page_out_of_sync(pfn_to_page(gmfn)) )
+        {
+            remove_out_of_sync_entries(d, gmfn);
+        }
+    }
+}
+
+static void inline
+free_shadow_l1_table(struct domain *d, unsigned long smfn)
+{
+    l1_pgentry_t *pl1e = map_domain_page(smfn);
+    int i;
+    struct pfn_info *spage = pfn_to_page(smfn);
+    u32 min_max = spage->tlbflush_timestamp;
+    int min = SHADOW_MIN(min_max);
+    int max = SHADOW_MAX(min_max);
+
+    for ( i = min; i <= max; i++ )
+    {
+        shadow_put_page_from_l1e(pl1e[i], d);
+        pl1e[i] = l1e_empty();
+    }
+
+    unmap_domain_page(pl1e);
+}
+
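free_shadow_l1_table() only sweeps the [min, max] window of entries that
were ever populated: per the pfn_info notes in shadow32.c below, that
min/max pair is packed into the shadow page's tlbflush_timestamp, and
SHADOW_MIN()/SHADOW_MAX() (from asm/shadow.h) unpack it. A sketch of one
plausible encoding, two 16-bit halves of a u32; the real macros' layout may
differ:

    #include <stdint.h>

    /* Assumed layout: min index in the high half, max index in the low. */
    static inline uint32_t pack_min_max(uint32_t min, uint32_t max)
    {
        return (min << 16) | (max & 0xffff);
    }
    static inline uint32_t shadow_min(uint32_t mm) { return mm >> 16;    }
    static inline uint32_t shadow_max(uint32_t mm) { return mm & 0xffff; }
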
+static void inline
+free_shadow_hl2_table(struct domain *d, unsigned long smfn)
+{
+    l1_pgentry_t *hl2 = map_domain_page(smfn);
+    int i, limit;
+
+    SH_VVLOG("%s: smfn=%lx freed", __func__, smfn);
+
+#ifdef __i386__
+    if ( shadow_mode_external(d) )
+        limit = L2_PAGETABLE_ENTRIES;
+    else
+        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
+#else
+    limit = 0; /* XXX x86/64 XXX */
+#endif
+
+    for ( i = 0; i < limit; i++ )
+    {
+        if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
+            put_page(pfn_to_page(l1e_get_pfn(hl2[i])));
+    }
+
+    unmap_domain_page(hl2);
+}
+
+static void inline
+free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type)
+{
+    l2_pgentry_t *pl2e = map_domain_page(smfn);
+    int i, external = shadow_mode_external(d);
+
+    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+        if ( external || is_guest_l2_slot(type, i) )
+            if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
+                put_shadow_ref(l2e_get_pfn(pl2e[i]));
+
+    if ( (PGT_base_page_table == PGT_l2_page_table) &&
+         shadow_mode_translate(d) && !external )
+    {
+        // free the ref to the hl2
+        //
+        put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]));
+    }
+
+    unmap_domain_page(pl2e);
+}
+
+void free_shadow_page(unsigned long smfn)
+{
+    struct pfn_info *page = &frame_table[smfn];
+    unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
+    struct domain *d = page_get_owner(pfn_to_page(gmfn));
+    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
+    unsigned long type = page->u.inuse.type_info & PGT_type_mask;
+
+    SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn);
+
+    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
+#if CONFIG_PAGING_LEVELS >=4
+    if (type == PGT_fl1_shadow) {
+        unsigned long mfn;
+        mfn = __shadow_status(d, gpfn, PGT_fl1_shadow);
+        if (!mfn)
+            gpfn |= (1UL << 63);
+    }
+#endif
+    delete_shadow_status(d, gpfn, gmfn, type);
+
+    switch ( type )
+    {
+    case PGT_l1_shadow:
+        perfc_decr(shadow_l1_pages);
+        shadow_demote(d, gpfn, gmfn);
+        free_shadow_l1_table(d, smfn);
+        break;
+#if defined (__i386__)
+    case PGT_l2_shadow:
+        perfc_decr(shadow_l2_pages);
+        shadow_demote(d, gpfn, gmfn);
+        free_shadow_l2_table(d, smfn, page->u.inuse.type_info);
+        break;
+
+    case PGT_hl2_shadow:
+        perfc_decr(hl2_table_pages);
+        shadow_demote(d, gpfn, gmfn);
+        free_shadow_hl2_table(d, smfn);
+        break;
+#else
+    case PGT_l2_shadow:
+    case PGT_l3_shadow:
+    case PGT_l4_shadow:
+        shadow_demote(d, gpfn, gmfn);
+        free_shadow_tables(d, smfn, shadow_type_to_level(type));
+        break;
+
+    case PGT_fl1_shadow:
+        free_shadow_fl1_table(d, smfn);
+        break;
+
+#endif
+
+    case PGT_snapshot:
+        perfc_decr(snapshot_pages);
+        break;
+
+    default:
+        printk("Free shadow weird page type mfn=%lx type=%08x\n",
+               page_to_pfn(page), page->u.inuse.type_info);
+        break;
+    }
+
+    d->arch.shadow_page_count--;
+
+    // No TLB flushes are needed the next time this page gets allocated.
+    //
+    page->tlbflush_timestamp = 0;
+    page->u.free.cpumask     = CPU_MASK_NONE;
+
+    if ( type == PGT_l1_shadow )
+    {
+        list_add(&page->list, &d->arch.free_shadow_frames);
+        perfc_incr(free_l1_pages);
+    }
+    else
+        free_domheap_page(page);
+}
+
+static void
+free_writable_pte_predictions(struct domain *d)
+{
+    int i;
+    struct shadow_status *x;
+
+    for ( i = 0; i < shadow_ht_buckets; i++ )
+    {
+        u32 count;
+        unsigned long *gpfn_list;
+
+        /* Skip empty buckets. */
+        if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
+            continue;
+
+        count = 0;
+        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
+            if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
+                count++;
+
+        gpfn_list = xmalloc_array(unsigned long, count);
+        count = 0;
+        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
+            if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
+                gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
+
+        while ( count )
+        {
+            count--;
+            delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred);
+        }
+
+        xfree(gpfn_list);
+    }
+}
+
+static void free_shadow_ht_entries(struct domain *d)
+{
+    struct shadow_status *x, *n;
+
+    SH_VLOG("freed tables count=%d l1=%d l2=%d",
+            d->arch.shadow_page_count, perfc_value(shadow_l1_pages), 
+            perfc_value(shadow_l2_pages));
+
+    n = d->arch.shadow_ht_extras;
+    while ( (x = n) != NULL )
+    {
+        d->arch.shadow_extras_count--;
+        n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
+        xfree(x);
+    }
+
+    d->arch.shadow_ht_extras = NULL;
+    d->arch.shadow_ht_free = NULL;
+
+    ASSERT(d->arch.shadow_extras_count == 0);
+    SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
+
+    if ( d->arch.shadow_dirty_bitmap != NULL )
+    {
+        xfree(d->arch.shadow_dirty_bitmap);
+        d->arch.shadow_dirty_bitmap = 0;
+        d->arch.shadow_dirty_bitmap_size = 0;
+    }
+
+    xfree(d->arch.shadow_ht);
+    d->arch.shadow_ht = NULL;
+}
+
+static void free_out_of_sync_entries(struct domain *d)
+{
+    struct out_of_sync_entry *x, *n;
+
+    n = d->arch.out_of_sync_extras;
+    while ( (x = n) != NULL )
+    {
+        d->arch.out_of_sync_extras_count--;
+        n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
+        xfree(x);
+    }
+
+    d->arch.out_of_sync_extras = NULL;
+    d->arch.out_of_sync_free = NULL;
+    d->arch.out_of_sync = NULL;
+
+    ASSERT(d->arch.out_of_sync_extras_count == 0);
+    FSH_LOG("freed extra out_of_sync entries, now %d",
+            d->arch.out_of_sync_extras_count);
+}
+
+void free_shadow_pages(struct domain *d)
+{
+    int                   i;
+    struct shadow_status *x;
+    struct vcpu          *v;
+ 
+    /*
+     * WARNING! The shadow page table must not currently be in use!
+     * e.g., You are expected to have paused the domain and synchronized CR3.
+     */
+
+    if( !d->arch.shadow_ht ) return;
+
+    shadow_audit(d, 1);
+
+    // first, remove any outstanding refs from out_of_sync entries...
+    //
+    free_out_of_sync_state(d);
+
+    // second, remove any outstanding refs from v->arch.shadow_table
+    // and CR3.
+    //
+    for_each_vcpu(d, v)
+    {
+        if ( pagetable_get_paddr(v->arch.shadow_table) )
+        {
+            put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table));
+            v->arch.shadow_table = mk_pagetable(0);
+        }
+
+        if ( v->arch.monitor_shadow_ref )
+        {
+            put_shadow_ref(v->arch.monitor_shadow_ref);
+            v->arch.monitor_shadow_ref = 0;
+        }
+    }
+
+#if defined (__i386__)
+    // For external shadows, remove the monitor table's refs
+    //
+    if ( shadow_mode_external(d) )
+    {
+        for_each_vcpu(d, v)
+        {
+            l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
+
+            if ( mpl2e )
+            {
+                l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
+                l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
+
+                if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
+                {
+                    put_shadow_ref(l2e_get_pfn(hl2e));
+                    mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
+                }
+                if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
+                {
+                    put_shadow_ref(l2e_get_pfn(smfn));
+                    mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
+                }
+            }
+        }
+    }
+#endif
+    // Now, the only refs to shadow pages that are left are from the shadow
+    // pages themselves.  We just unpin the pinned pages, and the rest
+    // should automatically disappear.
+    //
+    // NB: Beware: each explicitly or implicit call to free_shadow_page
+    // can/will result in the hash bucket getting rewritten out from
+    // under us...  First, collect the list of pinned pages, then
+    // free them.
+    //
+    for ( i = 0; i < shadow_ht_buckets; i++ )
+    {
+        u32 count;
+        unsigned long *mfn_list;
+
+        /* Skip empty buckets. */
+        if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
+            continue;
+
+        count = 0;
+        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
+            if ( MFN_PINNED(x->smfn) )
+                count++;
+        if ( !count )
+            continue;
+
+        mfn_list = xmalloc_array(unsigned long, count);
+        count = 0;
+        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
+            if ( MFN_PINNED(x->smfn) )
+                mfn_list[count++] = x->smfn;
+
+        while ( count )
+        {
+            shadow_unpin(mfn_list[--count]);
+        }
+        xfree(mfn_list);
+    }
+
+    // Now free the pre-zero'ed pages from the domain
+    //
+    struct list_head *list_ent, *tmp;
+    list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames)
+    {
+        list_del(list_ent);
+        perfc_decr(free_l1_pages);
+
+        struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
+        free_domheap_page(page);
+    }
+
+    shadow_audit(d, 0);
+
+    SH_LOG("Free shadow table.");
+}
+
+void __shadow_mode_disable(struct domain *d)
+{
+    if ( unlikely(!shadow_mode_enabled(d)) )
+        return;
+
+    /*
+     * Currently this does not fix up page ref counts, so it is valid to call
+     * only when a domain is being destroyed.
+     */
+    BUG_ON(!test_bit(_DOMF_dying, &d->domain_flags) &&
+           shadow_mode_refcounts(d));
+    d->arch.shadow_tainted_refcnts = shadow_mode_refcounts(d);
+
+    free_shadow_pages(d);
+    free_writable_pte_predictions(d);
+
+#ifndef NDEBUG
+    int i;
+    for ( i = 0; i < shadow_ht_buckets; i++ )
+    {
+        if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
+        {
+            printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%lx\n",
+                   __FILE__, i, d->arch.shadow_ht[i].gpfn_and_flags);
+            BUG();
+        }
+    }
+#endif
+
+    d->arch.shadow_mode = 0;
+
+    free_shadow_ht_entries(d);
+    free_out_of_sync_entries(d);
+
+    struct vcpu *v;
+    for_each_vcpu(d, v)
+    {
+        update_pagetables(v);
+    }
+}
+
+
+static void
+free_p2m_table(struct domain *d)
+{
+    // uh, this needs some work...  :)
+    BUG();
+}
+
+
+int __shadow_mode_enable(struct domain *d, unsigned int mode)
+{
+    struct vcpu *v;
+    int new_modes = (mode & ~d->arch.shadow_mode);
+
+    // Gotta be adding something to call this function.
+    ASSERT(new_modes);
+
+    // can't take anything away by calling this function.
+    ASSERT(!(d->arch.shadow_mode & ~mode));
+
+#if defined(CONFIG_PAGING_LEVELS)
+    if(!shadow_set_guest_paging_levels(d, CONFIG_PAGING_LEVELS)) {
+        printk("Unsupported guest paging levels\n");
+        domain_crash_synchronous(); /* need to take a clean path */
+    }
+#endif
+
+    for_each_vcpu(d, v)
+    {
+        invalidate_shadow_ldt(v);
+
+        // We need to set these up for __update_pagetables().
+        // See the comment there.
+
+        /*
+         * arch.guest_vtable
+         */
+        if ( v->arch.guest_vtable &&
+             (v->arch.guest_vtable != __linear_l2_table) )
+        {
+            unmap_domain_page(v->arch.guest_vtable);
+        }
+        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
+            v->arch.guest_vtable = __linear_l2_table;
+        else
+            v->arch.guest_vtable = NULL;
+
+        /*
+         * arch.shadow_vtable
+         */
+        if ( v->arch.shadow_vtable &&
+             (v->arch.shadow_vtable != __shadow_linear_l2_table) )
+        {
+            unmap_domain_page(v->arch.shadow_vtable);
+        }
+        if ( !(mode & SHM_external) && d->arch.ops->guest_paging_levels == 2)
+            v->arch.shadow_vtable = __shadow_linear_l2_table;
+        else
+            v->arch.shadow_vtable = NULL;
+        
+#if defined (__i386__)
+        /*
+         * arch.hl2_vtable
+         */
+        if ( v->arch.hl2_vtable &&
+             (v->arch.hl2_vtable != __linear_hl2_table) )
+        {
+            unmap_domain_page(v->arch.hl2_vtable);
+        }
+        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
+            v->arch.hl2_vtable = __linear_hl2_table;
+        else
+            v->arch.hl2_vtable = NULL;
+#endif
+        /*
+         * arch.monitor_table & arch.monitor_vtable
+         */
+        if ( v->arch.monitor_vtable )
+        {
+            free_monitor_pagetable(v);
+        }
+        if ( mode & SHM_external )
+        {
+            alloc_monitor_pagetable(v);
+        }
+    }
+
+    if ( new_modes & SHM_enable )
+    {
+        ASSERT( !d->arch.shadow_ht );
+        d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
+        if ( d->arch.shadow_ht == NULL )
+            goto nomem;
+
+        memset(d->arch.shadow_ht, 0,
+           shadow_ht_buckets * sizeof(struct shadow_status));
+    }
+
+    if ( new_modes & SHM_log_dirty )
+    {
+        ASSERT( !d->arch.shadow_dirty_bitmap );
+        d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63;
+        d->arch.shadow_dirty_bitmap = 
+            xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
+                                         (8 * sizeof(unsigned long)));
+        if ( d->arch.shadow_dirty_bitmap == NULL )
+        {
+            d->arch.shadow_dirty_bitmap_size = 0;
+            goto nomem;
+        }
+        memset(d->arch.shadow_dirty_bitmap, 0, 
+               d->arch.shadow_dirty_bitmap_size/8);
+    }
+
+    if ( new_modes & SHM_translate )
+    {
+        if ( !(new_modes & SHM_external) )
+        {
+            ASSERT( !pagetable_get_paddr(d->arch.phys_table) );
+            if ( !alloc_p2m_table(d) )
+            {
+                printk("alloc_p2m_table failed (out-of-memory?)\n");
+                goto nomem;
+            }
+        }
+        else
+        {
+            // external guests provide their own memory for their P2M maps.
+            //
+            ASSERT( d == page_get_owner(
+                        &frame_table[pagetable_get_pfn(d->arch.phys_table)]) );
+        }
+    }
+
+    printk("audit1\n");
+    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
+    printk("audit1 done\n");
+
+    // Get rid of any shadow pages from any previous shadow mode.
+    //
+    free_shadow_pages(d);
+
+    printk("audit2\n");
+    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
+    printk("audit2 done\n");
+
+    /*
+     * Tear down its counts by disassembling its page-table-based ref counts.
+     * Also remove CR3's gcount/tcount.
+     * That leaves things like GDTs and LDTs and external refs intact.
+     *
+     * Most pages will be writable tcount=0.
+     * Some will still be L1 tcount=0 or L2 tcount=0.
+     * Maybe some pages will be type none tcount=0.
+     * Pages granted external writable refs (via grant tables?) will
+     * still have a non-zero tcount.  That's OK.
+     *
+     * gcounts will generally be 1 for PGC_allocated.
+     * GDTs and LDTs will have additional gcounts.
+     * Any grant-table based refs will still be in the gcount.
+     *
+     * We attempt to grab writable refs to each page (thus setting its type).
+     * Immediately put back those type refs.
+     *
+     * Assert that no pages are left with L1/L2/L3/L4 type.
+     */
+    audit_adjust_pgtables(d, -1, 1);
+
+    d->arch.shadow_mode = mode;
+
+    if ( shadow_mode_refcounts(d) )
+    {
+        struct list_head *list_ent = d->page_list.next;
+        while ( list_ent != &d->page_list )
+        {
+            struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
+            if ( !get_page_type(page, PGT_writable_page) )
+                BUG();
+            put_page_type(page);
+
+            list_ent = page->list.next;
+        }
+    }
+
+    audit_adjust_pgtables(d, 1, 1);
+
+    printk("audit3\n");
+    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
+    printk("audit3 done\n");
+
+    return 0;
+
+ nomem:
+    if ( (new_modes & SHM_enable) )
+    {
+        xfree(d->arch.shadow_ht);
+        d->arch.shadow_ht = NULL;
+    }
+    if ( (new_modes & SHM_log_dirty) )
+    {
+        xfree(d->arch.shadow_dirty_bitmap);
+        d->arch.shadow_dirty_bitmap = NULL;
+    }
+    if ( (new_modes & SHM_translate) && !(new_modes & SHM_external) &&
+         pagetable_get_paddr(d->arch.phys_table) )
+    {
+        free_p2m_table(d);
+    }
+    return -ENOMEM;
+}
+
+
+int shadow_mode_enable(struct domain *d, unsigned int mode)
+{
+    int rc;
+    shadow_lock(d);
+    rc = __shadow_mode_enable(d, mode);
+    shadow_unlock(d);
+    return rc;
+}
+
+static int shadow_mode_table_op(
+    struct domain *d, dom0_shadow_control_t *sc)
+{
+    unsigned int      op = sc->op;
+    int               i, rc = 0;
+    struct vcpu *v;
+
+    ASSERT(shadow_lock_is_acquired(d));
+
+    SH_VLOG("shadow mode table op %lx %lx count %d",
+            (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table),  /* XXX SMP */
+            (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */
+            d->arch.shadow_page_count);
+
+    shadow_audit(d, 1);
+
+    switch ( op )
+    {
+    case DOM0_SHADOW_CONTROL_OP_FLUSH:
+        free_shadow_pages(d);
+
+        d->arch.shadow_fault_count       = 0;
+        d->arch.shadow_dirty_count       = 0;
+        d->arch.shadow_dirty_net_count   = 0;
+        d->arch.shadow_dirty_block_count = 0;
+
+        break;
+   
+    case DOM0_SHADOW_CONTROL_OP_CLEAN:
+        free_shadow_pages(d);
+
+        sc->stats.fault_count       = d->arch.shadow_fault_count;
+        sc->stats.dirty_count       = d->arch.shadow_dirty_count;
+        sc->stats.dirty_net_count   = d->arch.shadow_dirty_net_count;
+        sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
+
+        d->arch.shadow_fault_count       = 0;
+        d->arch.shadow_dirty_count       = 0;
+        d->arch.shadow_dirty_net_count   = 0;
+        d->arch.shadow_dirty_block_count = 0;
+ 
+        if ( (d->max_pages > sc->pages) || 
+             (sc->dirty_bitmap == NULL) || 
+             (d->arch.shadow_dirty_bitmap == NULL) )
+        {
+            rc = -EINVAL;
+            break;
+        }
+ 
+        sc->pages = d->max_pages;
+
+#define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
+        for ( i = 0; i < d->max_pages; i += chunk )
+        {
+            int bytes = ((((d->max_pages - i) > chunk) ?
+                          chunk : (d->max_pages - i)) + 7) / 8;
+
+            if (copy_to_user(
+                    sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                    d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+                    bytes))
+            {
+                // copy_to_user can fail when copying to guest app memory.
+                // app should zero buffer after mallocing, and pin it
+                rc = -EINVAL;
+                memset(
+                    d->arch.shadow_dirty_bitmap + 
+                    (i/(8*sizeof(unsigned long))),
+                    0, (d->max_pages/8) - (i/(8*sizeof(unsigned long))));
+                break;
+            }
+            memset(
+                d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                0, bytes);
+        }
+
+        break;
+
+    case DOM0_SHADOW_CONTROL_OP_PEEK:
+        sc->stats.fault_count       = d->arch.shadow_fault_count;
+        sc->stats.dirty_count       = d->arch.shadow_dirty_count;
+        sc->stats.dirty_net_count   = d->arch.shadow_dirty_net_count;
+        sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
+ 
+        if ( (d->max_pages > sc->pages) || 
+             (sc->dirty_bitmap == NULL) || 
+             (d->arch.shadow_dirty_bitmap == NULL) )
+        {
+            rc = -EINVAL;
+            break;
+        }
+ 
+        sc->pages = d->max_pages;
+        if (copy_to_user(
+            sc->dirty_bitmap, d->arch.shadow_dirty_bitmap, (d->max_pages+7)/8))
+        {
+            rc = -EINVAL;
+            break;
+        }
+
+        break;
+
+    default:
+        rc = -EINVAL;
+        break;
+    }
+
+    SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
+    shadow_audit(d, 1);
+
+    for_each_vcpu(d,v)
+        __update_pagetables(v);
+
+    return rc;
+}
+
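A note on the CLEAN path's chunking above: chunk is 8*1024 bits of dirty
bitmap per pass, i.e. 1kB of bitmap (covering 8192 pages) transferred and
cleared at a time to stay L1-cache friendly, with the final partial pass
rounded up to whole bytes. A standalone illustration of the arithmetic,
using a hypothetical page count:

    #include <stdio.h>

    #define CHUNK (8 * 1024)                  /* pages (= bits) per pass */

    int main(void)
    {
        unsigned int max_pages = 20000, i;

        for ( i = 0; i < max_pages; i += CHUNK )
        {
            unsigned int pages = (max_pages - i) > CHUNK ? CHUNK
                                                         : (max_pages - i);
            unsigned int bytes = (pages + 7) / 8; /* same rounding as above */
            printf("pass at page %5u: %4u bitmap bytes\n", i, bytes);
        }
        /* prints 1024, 1024, 452 -- 2500 bytes for 20000 page bits */
        return 0;
    }
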
+int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
+{
+    unsigned int op = sc->op;
+    int          rc = 0;
+    struct vcpu *v;
+
+    if ( unlikely(d == current->domain) )
+    {
+        DPRINTK("Don't try to do a shadow op on yourself!\n");
+        return -EINVAL;
+    }   
+
+    domain_pause(d);
+
+    shadow_lock(d);
+
+    switch ( op )
+    {
+    case DOM0_SHADOW_CONTROL_OP_OFF:
+        __shadow_sync_all(d);
+        __shadow_mode_disable(d);
+        break;
+
+    case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
+        free_shadow_pages(d);
+        rc = __shadow_mode_enable(d, SHM_enable);
+        break;
+
+    case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
+        free_shadow_pages(d);
+        rc = __shadow_mode_enable(
+            d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
+        break;
+
+    case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
+        free_shadow_pages(d);
+        rc = __shadow_mode_enable(
+            d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate);
+        break;
+
+    default:
+        rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
+        break;
+    }
+
+    shadow_unlock(d);
+
+    for_each_vcpu(d,v)
+        update_pagetables(v);
+
+    domain_unpause(d);
+
+    return rc;
+}
+
+void shadow_mode_init(void)
+{
+}
+
+int _shadow_mode_refcounts(struct domain *d)
+{
+    return shadow_mode_refcounts(d);
+}
+
+int
+set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn,
+              struct domain_mmap_cache *l2cache,
+              struct domain_mmap_cache *l1cache)
+{
+    unsigned long tabpfn = pagetable_get_pfn(d->arch.phys_table);
+    l2_pgentry_t *l2, l2e;
+    l1_pgentry_t *l1;
+    struct pfn_info *l1page;
+    unsigned long va = pfn << PAGE_SHIFT;
+
+    ASSERT(tabpfn != 0);
+
+    l2 = map_domain_page_with_cache(tabpfn, l2cache);
+    l2e = l2[l2_table_offset(va)];
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+    {
+        l1page = alloc_domheap_page(NULL);
+        if ( !l1page )
+        {
+            unmap_domain_page_with_cache(l2, l2cache);
+            return 0;
+        }
+
+        l1 = map_domain_page_with_cache(page_to_pfn(l1page), l1cache);
+        memset(l1, 0, PAGE_SIZE);
+        unmap_domain_page_with_cache(l1, l1cache);
+
+        l2e = l2e_from_page(l1page, __PAGE_HYPERVISOR);
+        l2[l2_table_offset(va)] = l2e;
+    }
+    unmap_domain_page_with_cache(l2, l2cache);
+
+    l1 = map_domain_page_with_cache(l2e_get_pfn(l2e), l1cache);
+    l1[l1_table_offset(va)] = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
+    unmap_domain_page_with_cache(l1, l1cache);
+
+    return 1;
+}
+
+int
+alloc_p2m_table(struct domain *d)
+{
+    struct list_head *list_ent;
+    struct pfn_info *page, *l2page;
+    l2_pgentry_t *l2;
+    unsigned long mfn, pfn;
+    struct domain_mmap_cache l1cache, l2cache;
+
+    l2page = alloc_domheap_page(NULL);
+    if ( l2page == NULL )
+        return 0;
+
+    domain_mmap_cache_init(&l1cache);
+    domain_mmap_cache_init(&l2cache);
+
+    d->arch.phys_table = mk_pagetable(page_to_phys(l2page));
+    l2 = map_domain_page_with_cache(page_to_pfn(l2page), &l2cache);
+    memset(l2, 0, PAGE_SIZE);
+    unmap_domain_page_with_cache(l2, &l2cache);
+
+    list_ent = d->page_list.next;
+    while ( list_ent != &d->page_list )
+    {
+        page = list_entry(list_ent, struct pfn_info, list);
+        mfn = page_to_pfn(page);
+        pfn = machine_to_phys_mapping[mfn];
+        ASSERT(pfn != INVALID_M2P_ENTRY);
+        ASSERT(pfn < (1u<<20));
+
+        set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache);
+
+        list_ent = page->list.next;
+    }
+
+    list_ent = d->xenpage_list.next;
+    while ( list_ent != &d->xenpage_list )
+    {
+        page = list_entry(list_ent, struct pfn_info, list);
+        mfn = page_to_pfn(page);
+        pfn = machine_to_phys_mapping[mfn];
+        if ( (pfn != INVALID_M2P_ENTRY) &&
+             (pfn < (1u<<20)) )
+        {
+            set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache);
+        }
+
+        list_ent = page->list.next;
+    }
+
+    domain_mmap_cache_destroy(&l2cache);
+    domain_mmap_cache_destroy(&l1cache);
+
+    return 1;
+}
+
+void shadow_l1_normal_pt_update(
+    struct domain *d,
+    unsigned long pa, l1_pgentry_t gpte,
+    struct domain_mmap_cache *cache)
+{
+    unsigned long sl1mfn;    
+    l1_pgentry_t *spl1e, spte;
+
+    shadow_lock(d);
+
+    sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow);
+    if ( sl1mfn )
+    {
+        SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%" PRIpte,
+                 (void *)pa, l1e_get_intpte(gpte));
+        l1pte_propagate_from_guest(current->domain, gpte, &spte);
+
+        spl1e = map_domain_page_with_cache(sl1mfn, cache);
+        spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte;
+        unmap_domain_page_with_cache(spl1e, cache);
+    }
+
+    shadow_unlock(d);
+}
+
+void shadow_l2_normal_pt_update(
+    struct domain *d,
+    unsigned long pa, l2_pgentry_t gpde,
+    struct domain_mmap_cache *cache)
+{
+    unsigned long sl2mfn;
+    l2_pgentry_t *spl2e;
+
+    shadow_lock(d);
+
+    sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow);
+    if ( sl2mfn )
+    {
+        SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte,
+                 (void *)pa, l2e_get_intpte(gpde));
+        spl2e = map_domain_page_with_cache(sl2mfn, cache);
+        validate_pde_change(d, gpde,
+                            &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]);
+        unmap_domain_page_with_cache(spl2e, cache);
+    }
+
+    shadow_unlock(d);
+}
+
+#if CONFIG_PAGING_LEVELS >= 3
+void shadow_l3_normal_pt_update(
+    struct domain *d,
+    unsigned long pa, l3_pgentry_t gpde,
+    struct domain_mmap_cache *cache)
+{
+    unsigned long sl3mfn;
+    pgentry_64_t *spl3e;
+
+    shadow_lock(d);
+
+    sl3mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l3_shadow);
+    if ( sl3mfn )
+    {
+        SH_VVLOG("shadow_l3_normal_pt_update pa=%p, gpde=%" PRIpte,
+                 (void *)pa, l3e_get_intpte(gpde));
+
+        spl3e = (pgentry_64_t *) map_domain_page_with_cache(sl3mfn, cache);
+        validate_entry_change(d, (pgentry_64_t *) &gpde,
+                             &spl3e[(pa & ~PAGE_MASK) / sizeof(l3_pgentry_t)], 
+                             shadow_type_to_level(PGT_l3_shadow));
+        unmap_domain_page_with_cache(spl3e, cache);
+    }
+
+    shadow_unlock(d);
+}
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+void shadow_l4_normal_pt_update(
+    struct domain *d,
+    unsigned long pa, l4_pgentry_t gpde,
+    struct domain_mmap_cache *cache)
+{
+    unsigned long sl4mfn;
+    pgentry_64_t *spl4e;
+
+    shadow_lock(d);
+
+    sl4mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l4_shadow);
+    if ( sl4mfn )
+    {
+        SH_VVLOG("shadow_l4_normal_pt_update pa=%p, gpde=%" PRIpte,
+                 (void *)pa, l4e_get_intpte(gpde));
+
+        spl4e = (pgentry_64_t *)map_domain_page_with_cache(sl4mfn, cache);
+        validate_entry_change(d, (pgentry_64_t *)&gpde,
+                             &spl4e[(pa & ~PAGE_MASK) / sizeof(l4_pgentry_t)], 
+                             shadow_type_to_level(PGT_l4_shadow));
+        unmap_domain_page_with_cache(spl4e, cache);
+    }
+
+    shadow_unlock(d);
+}
+#endif
+
+static void
+translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
+{
+    int i;
+    l1_pgentry_t *l1;
+
+    l1 = map_domain_page(l1mfn);
+    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
+    {
+        if ( is_guest_l1_slot(i) &&
+             (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
+        {
+            unsigned long mfn = l1e_get_pfn(l1[i]);
+            unsigned long gpfn = __mfn_to_gpfn(d, mfn);
+            ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
+            l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i]));
+        }
+    }
+    unmap_domain_page(l1);
+}
+
+// This is not general enough to handle arbitrary pagetables
+// with shared L1 pages, etc., but it is sufficient for bringing
+// up dom0.
+//
+void
+translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn,
+                    unsigned int type)
+{
+    int i;
+    l2_pgentry_t *l2;
+
+    ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
+
+    l2 = map_domain_page(l2mfn);
+    for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
+    {
+        if ( is_guest_l2_slot(type, i) &&
+             (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
+        {
+            unsigned long mfn = l2e_get_pfn(l2[i]);
+            unsigned long gpfn = __mfn_to_gpfn(d, mfn);
+            ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
+            l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i]));
+            translate_l1pgtable(d, p2m, mfn);
+        }
+    }
+    unmap_domain_page(l2);
+}
+
+void
+remove_shadow(struct domain *d, unsigned long gpfn, u32 stype)
+{
+    unsigned long smfn;
+
+    //printk("%s(gpfn=%lx, type=%x)\n", __func__, gpfn, stype);
+
+    shadow_lock(d);
+
+    while ( stype >= PGT_l1_shadow )
+    {
+        smfn = __shadow_status(d, gpfn, stype);
+        if ( smfn && MFN_PINNED(smfn) )
+            shadow_unpin(smfn);
+        stype -= PGT_l1_shadow;
+    }
+
+    shadow_unlock(d);
+}
+
+unsigned long
+gpfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
+{
+    ASSERT( shadow_mode_translate(d) );
+
+    perfc_incrc(gpfn_to_mfn_foreign);
+
+    unsigned long va = gpfn << PAGE_SHIFT;
+    unsigned long tabpfn = pagetable_get_pfn(d->arch.phys_table);
+    l2_pgentry_t *l2 = map_domain_page(tabpfn);
+    l2_pgentry_t l2e = l2[l2_table_offset(va)];
+    unmap_domain_page(l2);
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+    {
+        printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l2e=%" PRIpte 
"\n",
+               d->domain_id, gpfn, l2e_get_intpte(l2e));
+        return INVALID_MFN;
+    }
+    l1_pgentry_t *l1 = map_domain_page(l2e_get_pfn(l2e));
+    l1_pgentry_t l1e = l1[l1_table_offset(va)];
+    unmap_domain_page(l1);
+
+#if 0
+    printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => %lx tabpfn=%lx l2e=%lx 
l1tab=%lx, l1e=%lx\n",
+           d->domain_id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, tabpfn, l2e, 
l1tab, l1e);
+#endif
+
+    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
+    {
+        printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l1e=%" PRIpte 
"\n",
+               d->domain_id, gpfn, l1e_get_intpte(l1e));
+        return INVALID_MFN;
+    }
+
+    return l1e_get_pfn(l1e);
+}
+
+static u32 remove_all_access_in_page(
+  struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
+{
+    l1_pgentry_t *pl1e = map_domain_page(l1mfn);
+    l1_pgentry_t match;
+    unsigned long flags  = _PAGE_PRESENT;
+    int i;
+    u32 count = 0;
+    int is_l1_shadow =
+      ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) ==
+       PGT_l1_shadow);
+
+    match = l1e_from_pfn(forbidden_gmfn, flags);
+
+    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
+    {
+        if ( unlikely(!l1e_has_changed(pl1e[i], match, flags)) )
+        {
+            l1_pgentry_t ol1e = pl1e[i];
+            pl1e[i] = l1e_empty();
+            count++;
+
+            if ( is_l1_shadow )
+                shadow_put_page_from_l1e(ol1e, d);
+            else /* must be an hl2 page */
+                put_page(&frame_table[forbidden_gmfn]);
+        }
+    }
+
+    unmap_domain_page(pl1e);
+
+    return count;
+}
+
+static u32 __shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
+{
+    int i;
+    struct shadow_status *a;
+    u32 count = 0;
+
+    if ( unlikely(!shadow_mode_enabled(d)) )
+        return 0;
+
+    ASSERT(shadow_lock_is_acquired(d));
+    perfc_incrc(remove_all_access);
+
+    for (i = 0; i < shadow_ht_buckets; i++)
+    {
+        a = &d->arch.shadow_ht[i];
+        while ( a && a->gpfn_and_flags )
+        {
+            switch (a->gpfn_and_flags & PGT_type_mask)
+            {
+                case PGT_l1_shadow:
+                case PGT_l2_shadow:
+                case PGT_l3_shadow:
+                case PGT_l4_shadow:
+                case PGT_hl2_shadow:
+                    count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
+                    break;
+                case PGT_snapshot:
+                case PGT_writable_pred:
+                    // these can't hold refs to the forbidden page
+                    break;
+                default:
+                    BUG();
+            }
+
+            a = a->next;
+        }
+    }
+
+    return count;
+}
+
+void shadow_drop_references(
+  struct domain *d, struct pfn_info *page)
+{
+    if ( likely(!shadow_mode_refcounts(d)) ||
+      ((page->u.inuse.type_info & PGT_count_mask) == 0) )
+        return;
+
+    /* XXX This needs more thought... */
+    printk("%s: needing to call __shadow_remove_all_access for mfn=%lx\n",
+      __func__, page_to_pfn(page));
+    printk("Before: mfn=%lx c=%08x t=%08x\n", page_to_pfn(page),
+      page->count_info, page->u.inuse.type_info);
+
+    shadow_lock(d);
+    __shadow_remove_all_access(d, page_to_pfn(page));
+    shadow_unlock(d);
+
+    printk("After:  mfn=%lx c=%08x t=%08x\n", page_to_pfn(page),
+      page->count_info, page->u.inuse.type_info);
+}
+
+/* XXX Needs more thought. Neither pretty nor fast: a place holder. */
+void shadow_sync_and_drop_references(
+  struct domain *d, struct pfn_info *page)
+{
+    if ( likely(!shadow_mode_refcounts(d)) )
+        return;
+
+    shadow_lock(d);
+
+    if ( page_out_of_sync(page) )
+        __shadow_sync_mfn(d, page_to_pfn(page));
+
+    __shadow_remove_all_access(d, page_to_pfn(page));
+
+    shadow_unlock(d);
+}
diff -r d332d4df452e -r 0bcfd66a431e xen/arch/x86/shadow32.c
--- /dev/null   Mon Jul 11 09:22:15 2005
+++ b/xen/arch/x86/shadow32.c   Mon Jul 11 09:57:38 2005
@@ -0,0 +1,3388 @@
+/******************************************************************************
+ * arch/x86/shadow.c
+ * 
+ * Copyright (c) 2005 Michael A Fetterman
+ * Based on an earlier implementation by Ian Pratt et al
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/domain_page.h>
+#include <asm/shadow.h>
+#include <asm/page.h>
+#include <xen/event.h>
+#include <xen/sched.h>
+#include <xen/trace.h>
+
+#define MFN_PINNED(_x) (frame_table[_x].u.inuse.type_info & PGT_pinned)
+
+static void shadow_free_snapshot(struct domain *d,
+                                 struct out_of_sync_entry *entry);
+static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
+static void free_writable_pte_predictions(struct domain *d);
+
+#if SHADOW_DEBUG
+static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
+#endif
+
+/********
+
+There's a per-domain shadow table spin lock which works fine for SMP
+hosts. We don't have to worry about interrupts as no shadow operations
+happen in an interrupt context. It's probably not quite ready for SMP
+guest operation as we have to worry about synchronisation between gpte
+and spte updates. It's possible that this might only happen in a
+hypercall context, in which case we'll probably have a per-domain
+hypercall lock anyhow (at least initially).
+
+********/
+
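In code, the rule described above is the pattern used throughout this file:
every gpte-to-spte propagation runs under the per-domain lock. A skeletal
sketch with illustrative names (the real lock lives in the arch domain
struct and is taken via shadow_lock()/shadow_unlock()):

    struct shadow_state {
        spinlock_t lock;         /* the per-domain shadow table lock */
        /* shadow hash table, out-of-sync lists, ... */
    };

    static void propagate_gpte(struct shadow_state *s /*, gpte, va, ... */)
    {
        spin_lock(&s->lock);     /* plain spin_lock: no shadow operations
                                  * run in interrupt context, per above */
        /* read the guest pte, compute and install the shadow pte */
        spin_unlock(&s->lock);
    }
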
+static inline int
+shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
+               unsigned long new_type)
+{
+    struct pfn_info *page = pfn_to_page(gmfn);
+    int pinned = 0, okay = 1;
+
+    if ( page_out_of_sync(page) )
+    {
+        // Don't know how long ago this snapshot was taken.
+        // Can't trust it to be recent enough.
+        //
+        __shadow_sync_mfn(d, gmfn);
+    }
+
+    if ( !shadow_mode_refcounts(d) )
+        return 1;
+
+    if ( unlikely(page_is_page_table(page)) )
+        return 1;
+
+    FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type);
+
+    if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
+    {
+        FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx 
gmfn=%lx",
+                __func__, gpfn, gmfn);
+#if 1 || defined(LIVE_DANGEROUSLY)
+        set_bit(_PGC_page_table, &page->count_info);
+        return 1;
+#endif
+        return 0;
+        
+    }
+
+    // To convert this page to use as a page table, the writable count
+    // should now be zero.  Test this by grabbing the page as a page table,
+    // and then immediately releasing it.  This will also deal with any
+    // necessary TLB flushing issues for us.
+    //
+    // The cruft here about pinning doesn't really work right.  This
+    // needs rethinking/rewriting...  Need to gracefully deal with the
+    // TLB flushes required when promoting a writable page, and also deal
+    // with any outstanding (external) writable refs to this page (by
+    // refusing to promote it).  The pinning headache complicates this
+    // code -- it would all get much simpler if we stop using
+    // shadow_lock() and move the shadow code to BIGLOCK().
+    //
+    if ( unlikely(!get_page(page, d)) )
+        BUG(); // XXX -- needs more thought for a graceful failure
+    if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
+    {
+        pinned = 1;
+        put_page_and_type(page);
+    }
+    if ( get_page_type(page, PGT_base_page_table) )
+    {
+        set_bit(_PGC_page_table, &page->count_info);
+        put_page_type(page);
+    }
+    else
+    {
+        printk("shadow_promote: get_page_type failed "
+               "dom%d gpfn=%lx gmfn=%lx t=%08lx\n",
+               d->domain_id, gpfn, gmfn, new_type);
+        okay = 0;
+    }
+
+    // Now put the type back to writable...
+    if ( unlikely(!get_page_type(page, PGT_writable_page)) )
+        BUG(); // XXX -- needs more thought for a graceful failure
+    if ( unlikely(pinned) )
+    {
+        if ( unlikely(test_and_set_bit(_PGT_pinned,
+                                       &page->u.inuse.type_info)) )
+            BUG(); // hmm... someone pinned this again?
+    }
+    else
+        put_page_and_type(page);
+
+    return okay;
+}
+
+static inline void
+shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
+{
+    if ( !shadow_mode_refcounts(d) )
+        return;
+
+    ASSERT(frame_table[gmfn].count_info & PGC_page_table);
+
+    if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
+    {
+        clear_bit(_PGC_page_table, &frame_table[gmfn].count_info);
+
+        if ( page_out_of_sync(pfn_to_page(gmfn)) )
+        {
+            remove_out_of_sync_entries(d, gmfn);
+        }
+    }
+}
+
+/*
+ * Things in shadow mode that collect get_page() refs to the domain's
+ * pages are:
+ * - PGC_allocated takes a gen count, just like normal.
+ * - A writable page can be pinned (paravirtualized guests may consider
+ *   these pages to be L1s or L2s, and don't know the difference).
+ *   Pinning a page takes a gen count (but, for domains in shadow mode,
+ *   it *doesn't* take a type count)
+ * - CR3 grabs a ref to whatever it points at, just like normal.
+ * - Shadow mode grabs an initial gen count for itself, as a placeholder
+ *   for whatever references will exist.
+ * - Shadow PTEs that point to a page take a gen count, just like regular
+ *   PTEs.  However, they don't get a type count, as get_page_type() is
+ *   hardwired to keep writable pages' counts at 1 for domains in shadow
+ *   mode.
+ * - Whenever we shadow a page, the entry in the shadow hash grabs a
+ *   general ref to the page.
+ * - Whenever a page goes out of sync, the out of sync entry grabs a
+ *   general ref to the page.
+ */
+/*
+ * pfn_info fields for pages allocated as shadow pages:
+ *
+ * All 32 bits of count_info are a simple count of refs to this shadow
+ * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
+ * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
+ * references.
+ *
+ * u.inuse._domain is left NULL, to prevent accidentally allowing some random
+ * domain from gaining permissions to map this page.
+ *
+ * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
+ * shadowed.
+ * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
+ * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
+ * currently exists because this is a shadow of a root page, and we
+ * don't want to let those disappear just because no CR3 is currently pointing
+ * at it.
+ *
+ * tlbflush_timestamp holds a min & max index of valid page table entries
+ * within the shadow page.
+ */
+
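The packing described above is visible in alloc_shadow_page() just below
(page->u.inuse.type_info = psh_type | gmfn) and is decoded with
PGT_type_mask and PGT_mfn_mask in free_shadow_page() in shadow_public.c. A
sketch of that encode/decode, assuming (as the code does) that the two
masks select disjoint bit-fields; the mask values here are illustrative,
not Xen's real ones:

    #include <stdint.h>

    #define TYPE_MASK 0xe0000000u  /* hypothetical: shadow type, top bits */
    #define MFN_MASK  0x000fffffu  /* hypothetical: backing mfn, low bits */

    static inline uint32_t pack_type_info(uint32_t type, uint32_t gmfn)
    {
        return type | (gmfn & MFN_MASK);      /* cf. psh_type | gmfn */
    }
    static inline uint32_t shadow_type(uint32_t ti) { return ti & TYPE_MASK; }
    static inline uint32_t shadow_gmfn(uint32_t ti) { return ti & MFN_MASK;  }
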
+static inline unsigned long
+alloc_shadow_page(struct domain *d,
+                  unsigned long gpfn, unsigned long gmfn,
+                  u32 psh_type)
+{
+    struct pfn_info *page;
+    unsigned long smfn;
+    int pin = 0;
+
+    // Currently, we only keep pre-zero'ed pages around for use as L1's...
+    // This will change.  Soon.
+    //
+    if ( psh_type == PGT_l1_shadow )
+    {
+        if ( !list_empty(&d->arch.free_shadow_frames) )
+        {
+            struct list_head *entry = d->arch.free_shadow_frames.next;
+            page = list_entry(entry, struct pfn_info, list);
+            list_del(entry);
+            perfc_decr(free_l1_pages);
+        }
+        else
+        {
+            page = alloc_domheap_page(NULL);
+            if ( page != NULL )
+            {
+                /* Only map and zero the page if the allocation succeeded;
+                 * the NULL check below handles the failure case. */
+                void *l1 = map_domain_page(page_to_pfn(page));
+                memset(l1, 0, PAGE_SIZE);
+                unmap_domain_page(l1);
+            }
+        }
+    }
+    else
+        page = alloc_domheap_page(NULL);
+
+    if ( unlikely(page == NULL) )
+    {
+        printk("Couldn't alloc shadow page! dom%d count=%d\n",
+               d->domain_id, d->arch.shadow_page_count);
+        printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
+               perfc_value(shadow_l1_pages), 
+               perfc_value(shadow_l2_pages),
+               perfc_value(hl2_table_pages),
+               perfc_value(snapshot_pages));
+        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
+    }
+
+    smfn = page_to_pfn(page);
+
+    ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
+    page->u.inuse.type_info = psh_type | gmfn;
+    page->count_info = 0;
+    page->tlbflush_timestamp = 0;
+
+    switch ( psh_type )
+    {
+    case PGT_l1_shadow:
+        if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
+            goto fail;
+        perfc_incr(shadow_l1_pages);
+        d->arch.shadow_page_count++;
+        break;
+
+    case PGT_l2_shadow:
+        if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
+            goto fail;
+        perfc_incr(shadow_l2_pages);
+        d->arch.shadow_page_count++;
+        if ( PGT_l2_page_table == PGT_root_page_table )
+            pin = 1;
+
+        break;
+
+    case PGT_hl2_shadow:
+        // Treat an hl2 as an L1 for purposes of promotion.
+        // For external mode domains, treat them as an L2 for purposes of
+        // pinning.
+        //
+        if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
+            goto fail;
+        perfc_incr(hl2_table_pages);
+        d->arch.hl2_page_count++;
+        if ( shadow_mode_external(d) &&
+             (PGT_l2_page_table == PGT_root_page_table) )
+            pin = 1;
+
+        break;
+
+    case PGT_snapshot:
+        perfc_incr(snapshot_pages);
+        d->arch.snapshot_page_count++;
+        break;
+
+    default:
+        printk("Alloc shadow weird page type type=%08x\n", psh_type);
+        BUG();
+        break;
+    }
+
+    // Don't add a new shadow of something that already has a snapshot.
+    //
+    ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
+
+    set_shadow_status(d, gpfn, gmfn, smfn, psh_type);
+
+    if ( pin )
+        shadow_pin(smfn);
+
+    return smfn;
+
+  fail:
+    FSH_LOG("promotion of pfn=%lx mfn=%lx failed!  external gnttab refs?",
+            gpfn, gmfn);
+    free_domheap_page(page);
+    return 0;
+}
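+
+/*
+ * Sketch of the expected calling pattern (see shadow_l2_table() and
+ * shadow_map_l1_into_current_l2() below): callers request a specific
+ * shadow type and must treat a zero return as promotion failure, e.g.
+ *
+ *     smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow);
+ *     if ( smfn == 0 )
+ *         BUG();   // promotion failed -- what current callers do
+ */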
+
+static inline void
+free_shadow_l1_table(struct domain *d, unsigned long smfn)
+{
+    l1_pgentry_t *pl1e = map_domain_page(smfn);
+    int i;
+    struct pfn_info *spage = pfn_to_page(smfn);
+    u32 min_max = spage->tlbflush_timestamp;
+    int min = SHADOW_MIN(min_max);
+    int max = SHADOW_MAX(min_max);
+
+    for ( i = min; i <= max; i++ )
+    {
+        shadow_put_page_from_l1e(pl1e[i], d);
+        pl1e[i] = l1e_empty();
+    }
+
+    unmap_domain_page(pl1e);
+}
+
+static inline void
+free_shadow_hl2_table(struct domain *d, unsigned long smfn)
+{
+    l1_pgentry_t *hl2 = map_domain_page(smfn);
+    int i, limit;
+
+    SH_VVLOG("%s: smfn=%lx freed", __func__, smfn);
+
+#ifdef __i386__
+    if ( shadow_mode_external(d) )
+        limit = L2_PAGETABLE_ENTRIES;
+    else
+        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
+#else
+    limit = 0; /* XXX x86/64 XXX */
+#endif
+
+    for ( i = 0; i < limit; i++ )
+    {
+        if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
+            put_page(pfn_to_page(l1e_get_pfn(hl2[i])));
+    }
+
+    unmap_domain_page(hl2);
+}
+
+static inline void
+free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type)
+{
+    l2_pgentry_t *pl2e = map_domain_page(smfn);
+    int i, external = shadow_mode_external(d);
+
+    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+        if ( external || is_guest_l2_slot(type, i) )
+            if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
+                put_shadow_ref(l2e_get_pfn(pl2e[i]));
+
+    if ( (PGT_base_page_table == PGT_l2_page_table) &&
+         shadow_mode_translate(d) && !external )
+    {
+        // free the ref to the hl2
+        //
+        put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]));
+    }
+
+    unmap_domain_page(pl2e);
+}
+
+void free_shadow_page(unsigned long smfn)
+{
+    struct pfn_info *page = &frame_table[smfn];
+    unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
+    struct domain *d = page_get_owner(pfn_to_page(gmfn));
+    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
+    unsigned long type = page->u.inuse.type_info & PGT_type_mask;
+
+    SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn);
+
+    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
+
+    delete_shadow_status(d, gpfn, gmfn, type);
+
+    switch ( type )
+    {
+    case PGT_l1_shadow:
+        perfc_decr(shadow_l1_pages);
+        shadow_demote(d, gpfn, gmfn);
+        free_shadow_l1_table(d, smfn);
+        d->arch.shadow_page_count--;
+        break;
+
+    case PGT_l2_shadow:
+        perfc_decr(shadow_l2_pages);
+        shadow_demote(d, gpfn, gmfn);
+        free_shadow_l2_table(d, smfn, page->u.inuse.type_info);
+        d->arch.shadow_page_count--;
+        break;
+
+    case PGT_hl2_shadow:
+        perfc_decr(hl2_table_pages);
+        shadow_demote(d, gpfn, gmfn);
+        free_shadow_hl2_table(d, smfn);
+        d->arch.hl2_page_count--;
+        break;
+
+    case PGT_snapshot:
+        perfc_decr(snapshot_pages);
+        d->arch.snapshot_page_count--;
+        break;
+
+    default:
+        printk("Free shadow weird page type mfn=%lx type=%08x\n",
+               page_to_pfn(page), page->u.inuse.type_info);
+        break;
+    }
+
+    // No TLB flushes are needed the next time this page gets allocated.
+    //
+    page->tlbflush_timestamp = 0;
+    page->u.free.cpumask     = CPU_MASK_NONE;
+
+    if ( type == PGT_l1_shadow )
+    {
+        list_add(&page->list, &d->arch.free_shadow_frames);
+        perfc_incr(free_l1_pages);
+    }
+    else
+        free_domheap_page(page);
+}
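+
+/*
+ * Note on the L1 recycling above: free_shadow_l1_table() has already
+ * cleared every in-use entry (the min/max-bounded scan), so a page
+ * parked on d->arch.free_shadow_frames should be all zeroes again,
+ * which is why alloc_shadow_page() can reuse it without a fresh memset.
+ */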
+
+void
+remove_shadow(struct domain *d, unsigned long gpfn, u32 stype)
+{
+    unsigned long smfn;
+
+    //printk("%s(gpfn=%lx, type=%x)\n", __func__, gpfn, stype);
+
+    shadow_lock(d);
+
+    while ( stype >= PGT_l1_shadow )
+    {
+        smfn = __shadow_status(d, gpfn, stype);
+        if ( smfn && MFN_PINNED(smfn) )
+            shadow_unpin(smfn);
+        stype -= PGT_l1_shadow;
+    }
+
+    shadow_unlock(d);
+}
+
+static inline void
+release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
+{
+    struct pfn_info *page;
+
+    page = &frame_table[entry->gmfn];
+        
+    // Decrement ref count of guest & shadow pages
+    //
+    put_page(page);
+
+    // Only use entries that have low bits clear...
+    //
+    if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
+    {
+        put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
+        entry->writable_pl1e = -2;
+    }
+    else
+        ASSERT( entry->writable_pl1e == -1 );
+
+    // Free the snapshot
+    //
+    shadow_free_snapshot(d, entry);
+}
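+
+/*
+ * writable_pl1e encoding, as used above and set up in
+ * shadow_mark_va_out_of_sync() below: a valid value is the machine
+ * address of the writable shadow PTE (so its low, intra-entry bits are
+ * clear); -1 means no writable mapping was recorded; -2 marks an entry
+ * whose shadow ref has already been dropped here.
+ */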
+
+static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
+{
+    struct out_of_sync_entry *entry = d->arch.out_of_sync;
+    struct out_of_sync_entry **prev = &d->arch.out_of_sync;
+    struct out_of_sync_entry *found = NULL;
+
+    // NB: Be careful not to call something that manipulates this list
+    //     while walking it.  Collect the results into a separate list
+    //     first, then walk that list.
+    //
+    while ( entry )
+    {
+        if ( entry->gmfn == gmfn )
+        {
+            // remove from out of sync list
+            *prev = entry->next;
+
+            // add to found list
+            entry->next = found;
+            found = entry;
+
+            entry = *prev;
+            continue;
+        }
+        prev = &entry->next;
+        entry = entry->next;
+    }
+
+    prev = NULL;
+    entry = found;
+    while ( entry )
+    {
+        release_out_of_sync_entry(d, entry);
+
+        prev = &entry->next;
+        entry = entry->next;
+    }
+
+    // Add found list to free list
+    if ( prev )
+    {
+        *prev = d->arch.out_of_sync_free;
+        d->arch.out_of_sync_free = found;
+    }
+}
+
+static void free_out_of_sync_state(struct domain *d)
+{
+    struct out_of_sync_entry *entry;
+
+    // NB: Be careful not to call something that manipulates this list
+    //     while walking it.  Remove one item at a time, and always
+    //     restart from start of list.
+    //
+    while ( (entry = d->arch.out_of_sync) )
+    {
+        d->arch.out_of_sync = entry->next;
+        release_out_of_sync_entry(d, entry);
+
+        entry->next = d->arch.out_of_sync_free;
+        d->arch.out_of_sync_free = entry;
+    }
+}
+
+static void free_shadow_pages(struct domain *d)
+{
+    int                   i;
+    struct shadow_status *x;
+    struct vcpu          *v;
+ 
+    /*
+     * WARNING! The shadow page table must not currently be in use!
+     * e.g., You are expected to have paused the domain and synchronized CR3.
+     */
+
+    if( !d->arch.shadow_ht ) return;
+
+    shadow_audit(d, 1);
+
+    // first, remove any outstanding refs from out_of_sync entries...
+    //
+    free_out_of_sync_state(d);
+
+    // second, remove any outstanding refs from v->arch.shadow_table
+    // and CR3.
+    //
+    for_each_vcpu(d, v)
+    {
+        if ( pagetable_get_paddr(v->arch.shadow_table) )
+        {
+            put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table));
+            v->arch.shadow_table = mk_pagetable(0);
+        }
+
+        if ( v->arch.monitor_shadow_ref )
+        {
+            put_shadow_ref(v->arch.monitor_shadow_ref);
+            v->arch.monitor_shadow_ref = 0;
+        }
+    }
+
+    // For external shadows, remove the monitor table's refs
+    //
+    if ( shadow_mode_external(d) )
+    {
+        for_each_vcpu(d, v)
+        {
+            l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
+
+            if ( mpl2e )
+            {
+                l2_pgentry_t hl2e =
+                    mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
+                l2_pgentry_t sl2e =
+                    mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
+
+                if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
+                {
+                    put_shadow_ref(l2e_get_pfn(hl2e));
+                    mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
+                }
+                if ( l2e_get_flags(sl2e) & _PAGE_PRESENT )
+                {
+                    put_shadow_ref(l2e_get_pfn(sl2e));
+                    mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+                        l2e_empty();
+                }
+            }
+        }
+    }
+
+    // Now, the only refs to shadow pages that are left are from the shadow
+    // pages themselves.  We just unpin the pinned pages, and the rest
+    // should automatically disappear.
+    //
+    // NB: Beware: each explicit or implicit call to free_shadow_page
+    // can/will result in the hash bucket getting rewritten out from
+    // under us...  First, collect the list of pinned pages, then
+    // free them.
+    //
+    for ( i = 0; i < shadow_ht_buckets; i++ )
+    {
+        u32 count;
+        unsigned long *mfn_list;
+
+        /* Skip empty buckets. */
+        if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
+            continue;
+
+        count = 0;
+        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
+            if ( MFN_PINNED(x->smfn) )
+                count++;
+        if ( !count )
+            continue;
+
+        mfn_list = xmalloc_array(unsigned long, count);
+        count = 0;
+        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
+            if ( MFN_PINNED(x->smfn) )
+                mfn_list[count++] = x->smfn;
+
+        while ( count )
+        {
+            shadow_unpin(mfn_list[--count]);
+        }
+        xfree(mfn_list);
+    }
+
+    // Now free the pre-zero'ed pages from the domain
+    //
+    struct list_head *list_ent, *tmp;
+    list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames)
+    {
+        list_del(list_ent);
+        perfc_decr(free_l1_pages);
+
+        struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
+        free_domheap_page(page);
+    }
+
+    shadow_audit(d, 0);
+
+    SH_LOG("Free shadow table.");
+}
+
+void shadow_mode_init(void)
+{
+}
+
+int _shadow_mode_refcounts(struct domain *d)
+{
+    return shadow_mode_refcounts(d);
+}
+
+void alloc_monitor_pagetable(struct vcpu *v)
+{
+    unsigned long mmfn;
+    l2_pgentry_t *mpl2e;
+    struct pfn_info *mmfn_info;
+    struct domain *d = v->domain;
+
+    ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0);
+
+    mmfn_info = alloc_domheap_page(NULL);
+    ASSERT(mmfn_info != NULL);
+
+    mmfn = page_to_pfn(mmfn_info);
+    mpl2e = (l2_pgentry_t *)map_domain_page(mmfn);
+    memset(mpl2e, 0, PAGE_SIZE);
+
+#ifdef __i386__ /* XXX screws x86/64 build */
+    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
+           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+#endif
+
+    mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
+        l2e_from_paddr(__pa(d->arch.mm_perdomain_pt),
+                        __PAGE_HYPERVISOR);
+
+    // map the phys_to_machine map into the Read-Only MPT space for this domain
+    mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
+        l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
+                        __PAGE_HYPERVISOR);
+
+    // Don't (yet) have mappings for these...
+    // Don't want to accidentally see the idle_pg_table's linear mapping.
+    //
+    mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
+    mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
+
+    v->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
+    v->arch.monitor_vtable = mpl2e;
+}
+
+/*
+ * Free the pages for monitor_table and hl2_table
+ */
+void free_monitor_pagetable(struct vcpu *v)
+{
+    l2_pgentry_t *mpl2e, hl2e, sl2e;
+    unsigned long mfn;
+
+    ASSERT( pagetable_get_paddr(v->arch.monitor_table) );
+    
+    mpl2e = v->arch.monitor_vtable;
+
+    /*
+     * First get the mfn for hl2_table by looking at monitor_table
+     */
+    hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
+    if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
+    {
+        mfn = l2e_get_pfn(hl2e);
+        ASSERT(mfn);
+        put_shadow_ref(mfn);
+    }
+
+    sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
+    if ( l2e_get_flags(sl2e) & _PAGE_PRESENT )
+    {
+        mfn = l2e_get_pfn(sl2e);
+        ASSERT(mfn);
+        put_shadow_ref(mfn);
+    }
+
+    unmap_domain_page(mpl2e);
+
+    /*
+     * Then free monitor_table.
+     */
+    mfn = pagetable_get_pfn(v->arch.monitor_table);
+    free_domheap_page(&frame_table[mfn]);
+
+    v->arch.monitor_table = mk_pagetable(0);
+    v->arch.monitor_vtable = NULL;
+}
+
+int
+set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn,
+              struct domain_mmap_cache *l2cache,
+              struct domain_mmap_cache *l1cache)
+{
+    unsigned long tabpfn = pagetable_get_pfn(d->arch.phys_table);
+    l2_pgentry_t *l2, l2e;
+    l1_pgentry_t *l1;
+    struct pfn_info *l1page;
+    unsigned long va = pfn << PAGE_SHIFT;
+
+    ASSERT(tabpfn != 0);
+
+    l2 = map_domain_page_with_cache(tabpfn, l2cache);
+    l2e = l2[l2_table_offset(va)];
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+    {
+        l1page = alloc_domheap_page(NULL);
+        if ( !l1page )
+        {
+            unmap_domain_page_with_cache(l2, l2cache);
+            return 0;
+        }
+
+        l1 = map_domain_page_with_cache(page_to_pfn(l1page), l1cache);
+        memset(l1, 0, PAGE_SIZE);
+        unmap_domain_page_with_cache(l1, l1cache);
+
+        l2e = l2e_from_page(l1page, __PAGE_HYPERVISOR);
+        l2[l2_table_offset(va)] = l2e;
+    }
+    unmap_domain_page_with_cache(l2, l2cache);
+
+    l1 = map_domain_page_with_cache(l2e_get_pfn(l2e), l1cache);
+    l1[l1_table_offset(va)] = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
+    unmap_domain_page_with_cache(l1, l1cache);
+
+    return 1;
+}
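+
+/*
+ * Usage sketch (this is the pattern alloc_p2m_table() below follows):
+ * the mmap caches are initialized once, reused across many insertions,
+ * and destroyed when the table is fully populated:
+ *
+ *     struct domain_mmap_cache l1cache, l2cache;
+ *     domain_mmap_cache_init(&l1cache);
+ *     domain_mmap_cache_init(&l2cache);
+ *     set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache);
+ *     domain_mmap_cache_destroy(&l2cache);
+ *     domain_mmap_cache_destroy(&l1cache);
+ */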
+
+static int
+alloc_p2m_table(struct domain *d)
+{
+    struct list_head *list_ent;
+    struct pfn_info *page, *l2page;
+    l2_pgentry_t *l2;
+    unsigned long mfn, pfn;
+    struct domain_mmap_cache l1cache, l2cache;
+
+    l2page = alloc_domheap_page(NULL);
+    if ( l2page == NULL )
+        return 0;
+
+    domain_mmap_cache_init(&l1cache);
+    domain_mmap_cache_init(&l2cache);
+
+    d->arch.phys_table = mk_pagetable(page_to_phys(l2page));
+    l2 = map_domain_page_with_cache(page_to_pfn(l2page), &l2cache);
+    memset(l2, 0, PAGE_SIZE);
+    unmap_domain_page_with_cache(l2, &l2cache);
+
+    list_ent = d->page_list.next;
+    while ( list_ent != &d->page_list )
+    {
+        page = list_entry(list_ent, struct pfn_info, list);
+        mfn = page_to_pfn(page);
+        pfn = machine_to_phys_mapping[mfn];
+        ASSERT(pfn != INVALID_M2P_ENTRY);
+        ASSERT(pfn < (1u<<20));
+
+        set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache);
+
+        list_ent = page->list.next;
+    }
+
+    list_ent = d->xenpage_list.next;
+    while ( list_ent != &d->xenpage_list )
+    {
+        page = list_entry(list_ent, struct pfn_info, list);
+        mfn = page_to_pfn(page);
+        pfn = machine_to_phys_mapping[mfn];
+        if ( (pfn != INVALID_M2P_ENTRY) &&
+             (pfn < (1u<<20)) )
+        {
+            set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache);
+        }
+
+        list_ent = page->list.next;
+    }
+
+    domain_mmap_cache_destroy(&l2cache);
+    domain_mmap_cache_destroy(&l1cache);
+
+    return 1;
+}
+
+static void
+free_p2m_table(struct domain *d)
+{
+    // uh, this needs some work...  :)
+    BUG();
+}
+
+int __shadow_mode_enable(struct domain *d, unsigned int mode)
+{
+    struct vcpu *v;
+    int new_modes = (mode & ~d->arch.shadow_mode);
+
+    // Gotta be adding something to call this function.
+    ASSERT(new_modes);
+
+    // can't take anything away by calling this function.
+    ASSERT(!(d->arch.shadow_mode & ~mode));
+
+    for_each_vcpu(d, v)
+    {
+        invalidate_shadow_ldt(v);
+
+        // We need to set these up for __update_pagetables().
+        // See the comment there.
+
+        /*
+         * arch.guest_vtable
+         */
+        if ( v->arch.guest_vtable &&
+             (v->arch.guest_vtable != __linear_l2_table) )
+        {
+            unmap_domain_page(v->arch.guest_vtable);
+        }
+        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
+            v->arch.guest_vtable = __linear_l2_table;
+        else
+            v->arch.guest_vtable = NULL;
+
+        /*
+         * arch.shadow_vtable
+         */
+        if ( v->arch.shadow_vtable &&
+             (v->arch.shadow_vtable != __shadow_linear_l2_table) )
+        {
+            unmap_domain_page(v->arch.shadow_vtable);
+        }
+        if ( !(mode & SHM_external) )
+            v->arch.shadow_vtable = __shadow_linear_l2_table;
+        else
+            v->arch.shadow_vtable = NULL;
+
+        /*
+         * arch.hl2_vtable
+         */
+        if ( v->arch.hl2_vtable &&
+             (v->arch.hl2_vtable != __linear_hl2_table) )
+        {
+            unmap_domain_page(v->arch.hl2_vtable);
+        }
+        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
+            v->arch.hl2_vtable = __linear_hl2_table;
+        else
+            v->arch.hl2_vtable = NULL;
+
+        /*
+         * arch.monitor_table & arch.monitor_vtable
+         */
+        if ( v->arch.monitor_vtable )
+        {
+            free_monitor_pagetable(v);
+        }
+        if ( mode & SHM_external )
+        {
+            alloc_monitor_pagetable(v);
+        }
+    }
+
+    if ( new_modes & SHM_enable )
+    {
+        ASSERT( !d->arch.shadow_ht );
+        d->arch.shadow_ht = xmalloc_array(struct shadow_status,
+                                          shadow_ht_buckets);
+        if ( d->arch.shadow_ht == NULL )
+            goto nomem;
+
+        memset(d->arch.shadow_ht, 0,
+           shadow_ht_buckets * sizeof(struct shadow_status));
+    }
+
+    if ( new_modes & SHM_log_dirty )
+    {
+        ASSERT( !d->arch.shadow_dirty_bitmap );
+        d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63;
+        d->arch.shadow_dirty_bitmap = 
+            xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
+                                         (8 * sizeof(unsigned long)));
+        if ( d->arch.shadow_dirty_bitmap == NULL )
+        {
+            d->arch.shadow_dirty_bitmap_size = 0;
+            goto nomem;
+        }
+        memset(d->arch.shadow_dirty_bitmap, 0, 
+               d->arch.shadow_dirty_bitmap_size/8);
+    }
+
+    if ( new_modes & SHM_translate )
+    {
+        if ( !(new_modes & SHM_external) )
+        {
+            ASSERT( !pagetable_get_paddr(d->arch.phys_table) );
+            if ( !alloc_p2m_table(d) )
+            {
+                printk("alloc_p2m_table failed (out-of-memory?)\n");
+                goto nomem;
+            }
+        }
+        else
+        {
+            // external guests provide their own memory for their P2M maps.
+            //
+            ASSERT( d == page_get_owner(
+                        &frame_table[pagetable_get_pfn(d->arch.phys_table)]) );
+        }
+    }
+
+    printk("audit1\n");
+    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
+    printk("audit1 done\n");
+
+    // Get rid of any shadow pages from any previous shadow mode.
+    //
+    free_shadow_pages(d);
+
+    printk("audit2\n");
+    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
+    printk("audit2 done\n");
+
+    /*
+     * Tear down the domain's counts by disassembling its page-table-based
+     * ref counts.  Also remove CR3's gcount/tcount.
+     * That leaves things like GDTs and LDTs and external refs intact.
+     *
+     * Most pages will be writable tcount=0.
+     * Some will still be L1 tcount=0 or L2 tcount=0.
+     * Maybe some pages will be type none tcount=0.
+     * Pages granted external writable refs (via grant tables?) will
+     * still have a non-zero tcount.  That's OK.
+     *
+     * gcounts will generally be 1 for PGC_allocated.
+     * GDTs and LDTs will have additional gcounts.
+     * Any grant-table based refs will still be in the gcount.
+     *
+     * We attempt to grab writable refs to each page (thus setting its type).
+     * Immediately put back those type refs.
+     *
+     * Assert that no pages are left with L1/L2/L3/L4 type.
+     */
+    audit_adjust_pgtables(d, -1, 1);
+
+    d->arch.shadow_mode = mode;
+
+    if ( shadow_mode_refcounts(d) )
+    {
+        struct list_head *list_ent = d->page_list.next;
+        while ( list_ent != &d->page_list )
+        {
+            struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
+            if ( !get_page_type(page, PGT_writable_page) )
+                BUG();
+            put_page_type(page);
+
+            list_ent = page->list.next;
+        }
+    }
+
+    audit_adjust_pgtables(d, 1, 1);
+
+    printk("audit3\n");
+    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
+    printk("audit3 done\n");
+
+    return 0;
+
+ nomem:
+    if ( (new_modes & SHM_enable) )
+    {
+        xfree(d->arch.shadow_ht);
+        d->arch.shadow_ht = NULL;
+    }
+    if ( (new_modes & SHM_log_dirty) )
+    {
+        xfree(d->arch.shadow_dirty_bitmap);
+        d->arch.shadow_dirty_bitmap = NULL;
+    }
+    if ( (new_modes & SHM_translate) && !(new_modes & SHM_external) &&
+         pagetable_get_paddr(d->arch.phys_table) )
+    {
+        free_p2m_table(d);
+    }
+    return -ENOMEM;
+}
+
+int shadow_mode_enable(struct domain *d, unsigned int mode)
+{
+    int rc;
+    shadow_lock(d);
+    rc = __shadow_mode_enable(d, mode);
+    shadow_unlock(d);
+    return rc;
+}
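+
+/*
+ * For example, shadow_mode_control() below turns on log-dirty tracking
+ * by layering modes on top of whatever is already enabled:
+ *
+ *     rc = __shadow_mode_enable(
+ *         d, d->arch.shadow_mode | SHM_enable | SHM_log_dirty);
+ */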
+
+static void
+translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
+{
+    int i;
+    l1_pgentry_t *l1;
+
+    l1 = map_domain_page(l1mfn);
+    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
+    {
+        if ( is_guest_l1_slot(i) &&
+             (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
+        {
+            unsigned long mfn = l1e_get_pfn(l1[i]);
+            unsigned long gpfn = __mfn_to_gpfn(d, mfn);
+            ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
+            l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i]));
+        }
+    }
+    unmap_domain_page(l1);
+}
+
+// This is not general enough to handle arbitrary pagetables
+// with shared L1 pages, etc., but it is sufficient for bringing
+// up dom0.
+//
+void
+translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn,
+                    unsigned int type)
+{
+    int i;
+    l2_pgentry_t *l2;
+
+    ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
+
+    l2 = map_domain_page(l2mfn);
+    for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
+    {
+        if ( is_guest_l2_slot(type, i) &&
+             (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
+        {
+            unsigned long mfn = l2e_get_pfn(l2[i]);
+            unsigned long gpfn = __mfn_to_gpfn(d, mfn);
+            ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
+            l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i]));
+            translate_l1pgtable(d, p2m, mfn);
+        }
+    }
+    unmap_domain_page(l2);
+}
+
+static void free_shadow_ht_entries(struct domain *d)
+{
+    struct shadow_status *x, *n;
+
+    SH_VLOG("freed tables count=%d l1=%d l2=%d",
+            d->arch.shadow_page_count, perfc_value(shadow_l1_pages), 
+            perfc_value(shadow_l2_pages));
+
+    n = d->arch.shadow_ht_extras;
+    while ( (x = n) != NULL )
+    {
+        d->arch.shadow_extras_count--;
+        n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
+        xfree(x);
+    }
+
+    d->arch.shadow_ht_extras = NULL;
+    d->arch.shadow_ht_free = NULL;
+
+    ASSERT(d->arch.shadow_extras_count == 0);
+    SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
+
+    if ( d->arch.shadow_dirty_bitmap != NULL )
+    {
+        xfree(d->arch.shadow_dirty_bitmap);
+        d->arch.shadow_dirty_bitmap = NULL;
+        d->arch.shadow_dirty_bitmap_size = 0;
+    }
+
+    xfree(d->arch.shadow_ht);
+    d->arch.shadow_ht = NULL;
+}
+
+static void free_out_of_sync_entries(struct domain *d)
+{
+    struct out_of_sync_entry *x, *n;
+
+    n = d->arch.out_of_sync_extras;
+    while ( (x = n) != NULL )
+    {
+        d->arch.out_of_sync_extras_count--;
+        n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
+        xfree(x);
+    }
+
+    d->arch.out_of_sync_extras = NULL;
+    d->arch.out_of_sync_free = NULL;
+    d->arch.out_of_sync = NULL;
+
+    ASSERT(d->arch.out_of_sync_extras_count == 0);
+    FSH_LOG("freed extra out_of_sync entries, now %d",
+            d->arch.out_of_sync_extras_count);
+}
+
+void __shadow_mode_disable(struct domain *d)
+{
+    if ( unlikely(!shadow_mode_enabled(d)) )
+        return;
+
+    /*
+     * Currently this does not fix up page ref counts, so it is valid to call
+     * only when a domain is being destroyed.
+     */
+    BUG_ON(!test_bit(_DOMF_dying, &d->domain_flags) &&
+           shadow_mode_refcounts(d));
+    d->arch.shadow_tainted_refcnts = shadow_mode_refcounts(d);
+
+    free_shadow_pages(d);
+    free_writable_pte_predictions(d);
+
+#ifndef NDEBUG
+    int i;
+    for ( i = 0; i < shadow_ht_buckets; i++ )
+    {
+        if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
+        {
+            printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%lx\n",
+                   __FILE__, i, d->arch.shadow_ht[i].gpfn_and_flags);
+            BUG();
+        }
+    }
+#endif
+
+    d->arch.shadow_mode = 0;
+
+    free_shadow_ht_entries(d);
+    free_out_of_sync_entries(d);
+
+    struct vcpu *v;
+    for_each_vcpu(d, v)
+    {
+        update_pagetables(v);
+    }
+}
+
+static int shadow_mode_table_op(
+    struct domain *d, dom0_shadow_control_t *sc)
+{
+    unsigned int      op = sc->op;
+    int               i, rc = 0;
+    struct vcpu *v;
+
+    ASSERT(shadow_lock_is_acquired(d));
+
+    SH_VLOG("shadow mode table op %lx %lx count %d",
+            (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table),  /* XXX SMP */
+            (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */
+            d->arch.shadow_page_count);
+
+    shadow_audit(d, 1);
+
+    switch ( op )
+    {
+    case DOM0_SHADOW_CONTROL_OP_FLUSH:
+        free_shadow_pages(d);
+
+        d->arch.shadow_fault_count       = 0;
+        d->arch.shadow_dirty_count       = 0;
+        d->arch.shadow_dirty_net_count   = 0;
+        d->arch.shadow_dirty_block_count = 0;
+
+        break;
+   
+    case DOM0_SHADOW_CONTROL_OP_CLEAN:
+        free_shadow_pages(d);
+
+        sc->stats.fault_count       = d->arch.shadow_fault_count;
+        sc->stats.dirty_count       = d->arch.shadow_dirty_count;
+        sc->stats.dirty_net_count   = d->arch.shadow_dirty_net_count;
+        sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
+
+        d->arch.shadow_fault_count       = 0;
+        d->arch.shadow_dirty_count       = 0;
+        d->arch.shadow_dirty_net_count   = 0;
+        d->arch.shadow_dirty_block_count = 0;
+ 
+        if ( (d->max_pages > sc->pages) || 
+             (sc->dirty_bitmap == NULL) || 
+             (d->arch.shadow_dirty_bitmap == NULL) )
+        {
+            rc = -EINVAL;
+            break;
+        }
+ 
+        sc->pages = d->max_pages;
+
+#define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
+        for ( i = 0; i < d->max_pages; i += chunk )
+        {
+            int bytes = ((((d->max_pages - i) > chunk) ?
+                          chunk : (d->max_pages - i)) + 7) / 8;
+     
+            if (copy_to_user(
+                    sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                    d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+                    bytes))
+            {
+                // copy_to_user can fail when copying to guest app memory.
+                // The app should zero the buffer after mallocing, and pin it.
+                rc = -EINVAL;
+                memset(
+                    d->arch.shadow_dirty_bitmap + 
+                    (i/(8*sizeof(unsigned long))),
+                    0, (d->max_pages/8) - (i/(8*sizeof(unsigned long))));
+                break;
+            }
+
+            memset(
+                d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                0, bytes);
+        }
+
+        break;
+
+    case DOM0_SHADOW_CONTROL_OP_PEEK:
+        sc->stats.fault_count       = d->arch.shadow_fault_count;
+        sc->stats.dirty_count       = d->arch.shadow_dirty_count;
+        sc->stats.dirty_net_count   = d->arch.shadow_dirty_net_count;
+        sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
+ 
+        if ( (d->max_pages > sc->pages) || 
+             (sc->dirty_bitmap == NULL) || 
+             (d->arch.shadow_dirty_bitmap == NULL) )
+        {
+            rc = -EINVAL;
+            break;
+        }
+ 
+        sc->pages = d->max_pages;
+        if (copy_to_user(
+            sc->dirty_bitmap, d->arch.shadow_dirty_bitmap, (d->max_pages+7)/8))
+        {
+            rc = -EINVAL;
+            break;
+        }
+
+        break;
+
+    default:
+        rc = -EINVAL;
+        break;
+    }
+
+    SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
+    shadow_audit(d, 1);
+
+    for_each_vcpu(d,v)
+        __update_pagetables(v);
+
+    return rc;
+}
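+
+/*
+ * Chunk arithmetic for the CLEAN op above: chunk is 8*1024 pages, i.e.
+ * 8192 bits of dirty bitmap, so each copy_to_user() moves at most
+ * 8192/8 == 1024 bytes -- the "1kB chunks for L1 cache" the #define
+ * comment refers to.
+ */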
+
+int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
+{
+    unsigned int op = sc->op;
+    int          rc = 0;
+    struct vcpu *v;
+
+    if ( unlikely(d == current->domain) )
+    {
+        DPRINTK("Don't try to do a shadow op on yourself!\n");
+        return -EINVAL;
+    }   
+
+    domain_pause(d);
+
+    shadow_lock(d);
+
+    switch ( op )
+    {
+    case DOM0_SHADOW_CONTROL_OP_OFF:
+        __shadow_sync_all(d);
+        __shadow_mode_disable(d);
+        break;
+
+    case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
+        free_shadow_pages(d);
+        rc = __shadow_mode_enable(d, SHM_enable);
+        break;
+
+    case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
+        free_shadow_pages(d);
+        rc = __shadow_mode_enable(
+            d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
+        break;
+
+    case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
+        free_shadow_pages(d);
+        rc = __shadow_mode_enable(
+            d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate);
+        break;
+
+    default:
+        rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
+        break;
+    }
+
+    shadow_unlock(d);
+
+    for_each_vcpu(d,v)
+        update_pagetables(v);
+
+    domain_unpause(d);
+
+    return rc;
+}
+
+/*
+ * XXX KAF: Why is this VMX specific?
+ */
+void vmx_shadow_clear_state(struct domain *d)
+{
+    SH_VVLOG("%s:", __func__);
+    shadow_lock(d);
+    free_shadow_pages(d);
+    shadow_unlock(d);
+    update_pagetables(d->vcpu[0]);
+}
+
+unsigned long
+gpfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
+{
+    ASSERT( shadow_mode_translate(d) );
+
+    perfc_incrc(gpfn_to_mfn_foreign);
+
+    unsigned long va = gpfn << PAGE_SHIFT;
+    unsigned long tabpfn = pagetable_get_pfn(d->arch.phys_table);
+    l2_pgentry_t *l2 = map_domain_page(tabpfn);
+    l2_pgentry_t l2e = l2[l2_table_offset(va)];
+    unmap_domain_page(l2);
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+    {
+        printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l2e=%" PRIpte 
"\n",
+               d->domain_id, gpfn, l2e_get_intpte(l2e));
+        return INVALID_MFN;
+    }
+    l1_pgentry_t *l1 = map_domain_page(l2e_get_pfn(l2e));
+    l1_pgentry_t l1e = l1[l1_table_offset(va)];
+    unmap_domain_page(l1);
+
+#if 0
+    printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => %lx tabpfn=%lx l2e=%lx 
l1tab=%lx, l1e=%lx\n",
+           d->domain_id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, tabpfn, l2e, 
l1tab, l1e);
+#endif
+
+    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
+    {
+        printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l1e=%" PRIpte 
"\n",
+               d->domain_id, gpfn, l1e_get_intpte(l1e));
+        return INVALID_MFN;
+    }
+
+    return l1e_get_pfn(l1e);
+}
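+
+/*
+ * Worked example of the walk above: for gpfn 0x1234, va is 0x1234000;
+ * l2_table_offset(va) selects the phys_table L2 slot covering that
+ * range, and l1_table_offset(va) selects the PTE within the L1 page it
+ * references.  A non-present entry at either level yields INVALID_MFN.
+ */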
+
+static unsigned long
+shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
+                unsigned long smfn)
+{
+    unsigned long hl2mfn;
+    l1_pgentry_t *hl2;
+    int limit;
+
+    ASSERT(PGT_base_page_table == PGT_l2_page_table);
+
+    if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
+    {
+        printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n",
+               gpfn, gmfn);
+        BUG(); /* XXX Deal gracefully with failure. */
+    }
+
+    SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx",
+             gpfn, gmfn, smfn, hl2mfn);
+    perfc_incrc(shadow_hl2_table_count);
+
+    hl2 = map_domain_page(hl2mfn);
+
+#ifdef __i386__
+    if ( shadow_mode_external(d) )
+        limit = L2_PAGETABLE_ENTRIES;
+    else
+        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
+#else
+    limit = 0; /* XXX x86/64 XXX */
+#endif
+
+    memset(hl2, 0, limit * sizeof(l1_pgentry_t));
+
+    if ( !shadow_mode_external(d) )
+    {
+        memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
+               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+
+        // Setup easy access to the GL2, SL2, and HL2 frames.
+        //
+        hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
+            l1e_from_pfn(gmfn, __PAGE_HYPERVISOR);
+        hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+            l1e_from_pfn(smfn, __PAGE_HYPERVISOR);
+        hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
+            l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
+    }
+
+    unmap_domain_page(hl2);
+
+    return hl2mfn;
+}
+
+/*
+ * This could take and use a snapshot, and validate the entire page at
+ * once, or it could continue to fault in entries one at a time...
+ * Might be worth investigating...
+ */
+static unsigned long shadow_l2_table(
+    struct domain *d, unsigned long gpfn, unsigned long gmfn)
+{
+    unsigned long smfn;
+    l2_pgentry_t *spl2e;
+
+    SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
+
+    perfc_incrc(shadow_l2_table_count);
+
+    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
+    {
+        printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
+               gpfn, gmfn);
+        BUG(); /* XXX Deal gracefully with failure. */
+    }
+
+    spl2e = (l2_pgentry_t *)map_domain_page(smfn);
+
+    /* Install hypervisor and 2x linear p.t. mappings. */
+    if ( (PGT_base_page_table == PGT_l2_page_table) &&
+         !shadow_mode_external(d) )
+    {
+        /*
+         * We could proactively fill in PDEs for pages that are already
+         * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
+         * (restriction required for coherence of the accessed bit). However,
+         * we tried it and it didn't help performance. This is simpler. 
+         */
+        memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
+
+        /* Install hypervisor and 2x linear p.t. mappings. */
+        memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
+               &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+
+        spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+            l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
+
+        spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
+            l2e_from_paddr(__pa(page_get_owner(&frame_table[gmfn])->arch.mm_perdomain_pt),
+                            __PAGE_HYPERVISOR);
+
+        if ( shadow_mode_translate(d) ) // NB: not external
+        {
+            unsigned long hl2mfn;
+
+            spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
+                l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
+                                __PAGE_HYPERVISOR);
+
+            if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
+                hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
+
+            // shadow_mode_translate (but not external) sl2 tables hold a
+            // ref to their hl2.
+            //
+            if ( !get_shadow_ref(hl2mfn) )
+                BUG();
+            
+            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
+                l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
+        }
+        else
+            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
+                l2e_from_pfn(gmfn, __PAGE_HYPERVISOR);
+    }
+    else
+    {
+        memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));        
+    }
+
+    unmap_domain_page(spl2e);
+
+    SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn);
+    return smfn;
+}
+
+void shadow_map_l1_into_current_l2(unsigned long va)
+{ 
+    struct vcpu *v = current;
+    struct domain *d = v->domain;
+    l1_pgentry_t *gpl1e, *spl1e;
+    l2_pgentry_t gl2e, sl2e;
+    unsigned long gl1pfn, gl1mfn, sl1mfn;
+    int i, init_table = 0;
+
+    __guest_get_l2e(v, va, &gl2e);
+    ASSERT(l2e_get_flags(gl2e) & _PAGE_PRESENT);
+    gl1pfn = l2e_get_pfn(gl2e);
+
+    if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
+    {
+        /* This L1 is NOT already shadowed so we need to shadow it. */
+        SH_VVLOG("4a: l1 not shadowed");
+
+        gl1mfn = __gpfn_to_mfn(d, gl1pfn);
+        if ( unlikely(!VALID_MFN(gl1mfn)) )
+        {
+            // Attempt to use an invalid pfn as an L1 page.
+            // XXX this needs to be more graceful!
+            BUG();
+        }
+
+        if ( unlikely(!(sl1mfn =
+                        alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
+        {
+            printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n",
+                   gl1pfn, gl1mfn);
+            BUG(); /* XXX Need to deal gracefully with failure. */
+        }
+
+        perfc_incrc(shadow_l1_table_count);
+        init_table = 1;
+    }
+    else
+    {
+        /* This L1 is shadowed already, but the L2 entry is missing. */
+        SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn);
+    }
+
+#ifndef NDEBUG
+    l2_pgentry_t old_sl2e;
+    __shadow_get_l2e(v, va, &old_sl2e);
+    ASSERT( !(l2e_get_flags(old_sl2e) & _PAGE_PRESENT) );
+#endif
+
+    if ( !get_shadow_ref(sl1mfn) )
+        BUG();
+    l2pde_general(d, &gl2e, &sl2e, sl1mfn);
+    __guest_set_l2e(v, va, gl2e);
+    __shadow_set_l2e(v, va, sl2e);
+
+    if ( init_table )
+    {
+        l1_pgentry_t sl1e;
+        int index = l1_table_offset(va);
+        int min = 1, max = 0;
+
+        gpl1e = &(linear_pg_table[l1_linear_offset(va) &
+                              ~(L1_PAGETABLE_ENTRIES-1)]);
+
+        spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) &
+                                     ~(L1_PAGETABLE_ENTRIES-1)]);
+
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
+            if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
+                 unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
+                sl1e = l1e_empty();
+            if ( l1e_get_flags(sl1e) == 0 )
+            {
+                // First copy entries from 0 until first invalid.
+                // Then copy entries from index until first invalid.
+                //
+                if ( i < index ) {
+                    i = index - 1;
+                    continue;
+                }
+                break;
+            }
+            spl1e[i] = sl1e;
+            if ( unlikely(i < min) )
+                min = i;
+            if ( likely(i > max) )
+                max = i;
+        }
+
+        frame_table[sl1mfn].tlbflush_timestamp =
+            SHADOW_ENCODE_MIN_MAX(min, max);
+    }
+}
+
+void shadow_invlpg(struct vcpu *v, unsigned long va)
+{
+    struct domain *d = v->domain;
+    l1_pgentry_t gpte, spte;
+
+    ASSERT(shadow_mode_enabled(d));
+
+    shadow_lock(d);
+
+    __shadow_sync_va(v, va);
+
+    // XXX mafetter: will need to think about 4MB pages...
+
+    // It's not strictly necessary to update the shadow here,
+    // but it might save a fault later.
+    //
+    if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
+                         sizeof(gpte))) {
+        perfc_incrc(shadow_invlpg_faults);
+        shadow_unlock(d);   /* don't leak the shadow lock on the fault path */
+        return;
+    }
+    l1pte_propagate_from_guest(d, gpte, &spte);
+    shadow_set_l1e(va, spte, 1);
+
+    shadow_unlock(d);
+}
+
+struct out_of_sync_entry *
+shadow_alloc_oos_entry(struct domain *d)
+{
+    struct out_of_sync_entry *f, *extra;
+    unsigned size, i;
+
+    if ( unlikely(d->arch.out_of_sync_free == NULL) )
+    {
+        FSH_LOG("Allocate more fullshadow tuple blocks.");
+
+        size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
+        extra = xmalloc_bytes(size);
+
+        /* XXX Should be more graceful here. */
+        if ( extra == NULL )
+            BUG();
+
+        memset(extra, 0, size);
+
+        /* Record the allocation block so it can be correctly freed later. */
+        d->arch.out_of_sync_extras_count++;
+        *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) = 
+            d->arch.out_of_sync_extras;
+        d->arch.out_of_sync_extras = &extra[0];
+
+        /* Thread a free chain through the newly-allocated nodes. */
+        for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
+            extra[i].next = &extra[i+1];
+        extra[i].next = NULL;
+
+        /* Add the new nodes to the free list. */
+        d->arch.out_of_sync_free = &extra[0];
+    }
+
+    /* Allocate a new node from the quicklist. */
+    f = d->arch.out_of_sync_free;
+    d->arch.out_of_sync_free = f->next;
+
+    return f;
+}
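+
+/*
+ * Note the allocation-block trick above: each block holds
+ * out_of_sync_extra_size entries plus one trailing pointer slot, so
+ * extra[out_of_sync_extra_size] chains the blocks together.
+ * free_out_of_sync_entries() walks that chain to release whole blocks
+ * at once.
+ */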
+
+static inline unsigned long
+shadow_make_snapshot(
+    struct domain *d, unsigned long gpfn, unsigned long gmfn)
+{
+    unsigned long smfn, sl1mfn = 0;
+    void *original, *snapshot;
+    u32 min_max = 0;
+    int min, max, length;
+
+    if ( test_and_set_bit(_PGC_out_of_sync, &frame_table[gmfn].count_info) )
+    {
+        ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
+        return SHADOW_SNAPSHOT_ELSEWHERE;
+    }
+
+    perfc_incrc(shadow_make_snapshot);
+
+    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
+    {
+        printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n"
+               "Dom%d snapshot_count_count=%d\n",
+               gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count);
+        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
+    }
+
+    if ( !get_shadow_ref(smfn) )
+        BUG();
+
+    if ( shadow_mode_refcounts(d) &&
+         (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
+        min_max = pfn_to_page(sl1mfn)->tlbflush_timestamp;
+    pfn_to_page(smfn)->tlbflush_timestamp = min_max;
+
+    min = SHADOW_MIN(min_max);
+    max = SHADOW_MAX(min_max);
+    length = max - min + 1;
+    perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
+
+    min *= sizeof(l1_pgentry_t);
+    length *= sizeof(l1_pgentry_t);
+
+    original = map_domain_page(gmfn);
+    snapshot = map_domain_page(smfn);
+    memcpy(snapshot + min, original + min, length);
+    unmap_domain_page(original);
+    unmap_domain_page(snapshot);
+
+    return smfn;
+}
+
+static void
+shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
+{
+    void *snapshot;
+
+    if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
+        return;
+
+    // Clear the out_of_sync bit.
+    //
+    clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info);
+
+    // XXX Need to think about how to protect the domain's
+    // information less expensively.
+    //
+    snapshot = map_domain_page(entry->snapshot_mfn);
+    memset(snapshot, 0, PAGE_SIZE);
+    unmap_domain_page(snapshot);
+
+    put_shadow_ref(entry->snapshot_mfn);
+}
+
+struct out_of_sync_entry *
+shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
+                             unsigned long mfn)
+{
+    struct domain *d = v->domain;
+    struct pfn_info *page = &frame_table[mfn];
+    struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
+
+    ASSERT(shadow_lock_is_acquired(d));
+    ASSERT(pfn_valid(mfn));
+
+#ifndef NDEBUG
+    u32 type = page->u.inuse.type_info & PGT_type_mask;
+    if ( shadow_mode_refcounts(d) )
+    {
+        ASSERT(type == PGT_writable_page);
+    }
+    else
+    {
+        ASSERT(type && (type < PGT_l4_page_table));
+    }
+#endif
+
+    FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08x", __func__,
+            gpfn, mfn, page->count_info, page->u.inuse.type_info);
+
+    // XXX this will require some more thought...  Cross-domain sharing and
+    //     modification of page tables?  Hmm...
+    //
+    if ( d != page_get_owner(page) )
+        BUG();
+
+    perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
+
+    entry->gpfn = gpfn;
+    entry->gmfn = mfn;
+    entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
+    entry->writable_pl1e = -1;
+
+#if SHADOW_DEBUG
+    mark_shadows_as_reflecting_snapshot(d, gpfn);
+#endif
+
+    // increment guest's ref count to represent the entry in the
+    // full shadow out-of-sync list.
+    //
+    get_page(page, d);
+
+    // Add to the out-of-sync list
+    //
+    entry->next = d->arch.out_of_sync;
+    d->arch.out_of_sync = entry;
+
+    return entry;
+}
+
+void shadow_mark_va_out_of_sync(
+    struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va)
+{
+    struct out_of_sync_entry *entry =
+        shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
+    l2_pgentry_t sl2e;
+
+    // We need the address of shadow PTE that maps @va.
+    // It might not exist yet.  Make sure it's there.
+    //
+    __shadow_get_l2e(v, va, &sl2e);
+    if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
+    {
+        // either this L1 isn't shadowed yet, or the shadow isn't linked into
+        // the current L2.
+        shadow_map_l1_into_current_l2(va);
+        __shadow_get_l2e(v, va, &sl2e);
+    }
+    ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
+
+    // NB: this is stored as a machine address.
+    entry->writable_pl1e =
+        l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
+    ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
+
+    // Increment shadow's page count to represent the reference
+    // inherent in entry->writable_pl1e
+    //
+    if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
+        BUG();
+
+    FSH_LOG("mark_out_of_sync(va=%lx -> writable_pl1e=%lx)",
+            va, entry->writable_pl1e);
+}
+
+/*
+ * Returns 1 if the snapshot for @gpfn exists and its @index'th entry matches.
+ * Returns 0 otherwise.
+ */
+static int snapshot_entry_matches(
+    struct domain *d, l1_pgentry_t *guest_pt,
+    unsigned long gpfn, unsigned index)
+{
+    unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot);
+    l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ...
+    int entries_match;
+
+    perfc_incrc(snapshot_entry_matches_calls);
+
+    if ( !smfn )
+        return 0;
+
+    snapshot = map_domain_page(smfn);
+
+    if (__copy_from_user(&gpte, &guest_pt[index],
+                         sizeof(gpte)))
+    {
+        unmap_domain_page(snapshot);   /* don't leak the mapping on failure */
+        return 0;
+    }
+
+    // This could probably be smarter, but this is sufficient for
+    // our current needs.
+    //
+    entries_match = !l1e_has_changed(gpte, snapshot[index],
+                                     PAGE_FLAG_MASK);
+
+    unmap_domain_page(snapshot);
+
+#ifdef PERF_COUNTERS
+    if ( entries_match )
+        perfc_incrc(snapshot_entry_matches_true);
+#endif
+
+    return entries_match;
+}
+
+/*
+ * Returns 1 if va's shadow mapping is out-of-sync.
+ * Returns 0 otherwise.
+ */
+int __shadow_out_of_sync(struct vcpu *v, unsigned long va)
+{
+    struct domain *d = v->domain;
+    unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table);
+    unsigned long l2pfn = __mfn_to_gpfn(d, l2mfn);
+    l2_pgentry_t l2e;
+    unsigned long l1pfn, l1mfn;
+
+    ASSERT(shadow_lock_is_acquired(d));
+    ASSERT(VALID_M2P(l2pfn));
+
+    perfc_incrc(shadow_out_of_sync_calls);
+
+    if ( page_out_of_sync(&frame_table[l2mfn]) &&
+         !snapshot_entry_matches(d, (l1_pgentry_t *)v->arch.guest_vtable,
+                                 l2pfn, l2_table_offset(va)) )
+        return 1;
+
+    __guest_get_l2e(v, va, &l2e);
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+        return 0;
+
+    l1pfn = l2e_get_pfn(l2e);
+    l1mfn = __gpfn_to_mfn(d, l1pfn);
+
+    // If the l1 pfn is invalid, it can't be out of sync...
+    if ( !VALID_MFN(l1mfn) )
+        return 0;
+
+    if ( page_out_of_sync(&frame_table[l1mfn]) &&
+         !snapshot_entry_matches(
+             d, &linear_pg_table[l1_linear_offset(va) & ~(L1_PAGETABLE_ENTRIES-1)],
+             l1pfn, l1_table_offset(va)) )
+        return 1;
+
+    return 0;
+}
+
+#define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t)))
+static inline unsigned long
+predict_writable_pte_page(struct domain *d, unsigned long gpfn)
+{
+    return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
+}
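+
+/*
+ * For scale: with 4KB pages and 4-byte PTEs (non-PAE i386),
+ * GPFN_TO_GPTEPAGE(gpfn) == gpfn / 1024, so all gpfns mapped by the
+ * same guest L1 page share a single PGT_writable_pred slot.
+ */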
+
+static inline void
+increase_writable_pte_prediction(struct domain *d, unsigned long gpfn,
+                                 unsigned long prediction)
+{
+    unsigned long score = prediction & PGT_score_mask;
+    int create = (score == 0);
+
+    // saturating addition
+    score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
+    score = score ? score : PGT_score_mask;
+
+    prediction = (prediction & PGT_mfn_mask) | score;
+
+    //printk("increase gpfn=%lx pred=%lx create=%d\n", gpfn, prediction, 
create);
+    set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, 
PGT_writable_pred);
+
+    if ( create )
+        perfc_incr(writable_pte_predictions);
+}
+
+static inline void
+decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn,
+                                 unsigned long prediction)
+{
+    unsigned long score = prediction & PGT_score_mask;
+    ASSERT(score);
+
+    // divide score by 2...  We don't like bad predictions.
+    //
+    score = (score >> 1) & PGT_score_mask;
+
+    prediction = (prediction & PGT_mfn_mask) | score;
+
+    //printk("decrease gpfn=%lx pred=%lx score=%lx\n", gpfn, prediction, 
score);
+
+    if ( score )
+        set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
+    else
+    {
+        delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred);
+        perfc_decr(writable_pte_predictions);
+    }
+}
+
+static void
+free_writable_pte_predictions(struct domain *d)
+{
+    int i;
+    struct shadow_status *x;
+
+    for ( i = 0; i < shadow_ht_buckets; i++ )
+    {
+        u32 count;
+        unsigned long *gpfn_list;
+
+        /* Skip empty buckets. */
+        if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
+            continue;
+
+        count = 0;
+        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
+            if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
+                count++;
+
+        gpfn_list = xmalloc_array(unsigned long, count);
+        count = 0;
+        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
+            if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
+                gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
+
+        while ( count )
+        {
+            count--;
+            delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred);
+        }
+
+        xfree(gpfn_list);
+    }
+}
+
+static u32 remove_all_write_access_in_ptpage(
+    struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
+    unsigned long readonly_gpfn, unsigned long readonly_gmfn,
+    u32 max_refs_to_find, unsigned long prediction)
+{
+    l1_pgentry_t *pt = map_domain_page(pt_mfn);
+    l1_pgentry_t match;
+    unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
+    int i;
+    u32 found = 0;
+    int is_l1_shadow =
+        ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) ==
+         PGT_l1_shadow);
+
+    match = l1e_from_pfn(readonly_gmfn, flags);
+
+    // returns true if all refs have been found and fixed.
+    //
+    int fix_entry(int i)
+    {
+        l1_pgentry_t old = pt[i];
+        l1_pgentry_t new = old;
+
+        l1e_remove_flags(new,_PAGE_RW);
+        if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
+            BUG();
+        found++;
+        pt[i] = new;
+        if ( is_l1_shadow )
+            shadow_put_page_from_l1e(old, d);
+
+#if 0
+        printk("removed write access to pfn=%lx mfn=%lx in smfn=%lx entry %x "
+               "is_l1_shadow=%d\n",
+               readonly_gpfn, readonly_gmfn, pt_mfn, i, is_l1_shadow);
+#endif
+
+        return (found == max_refs_to_find);
+    }
+
+    i = readonly_gpfn & (L1_PAGETABLE_ENTRIES - 1);
+    if ( !l1e_has_changed(pt[i], match, flags) && fix_entry(i) )
+    {
+        perfc_incrc(remove_write_fast_exit);
+        increase_writable_pte_prediction(d, readonly_gpfn, prediction);
+        unmap_domain_page(pt);
+        return found;
+    }
+ 
+    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
+    {
+        if ( unlikely(!l1e_has_changed(pt[i], match, flags)) && fix_entry(i) )
+            break;
+    }
+
+    unmap_domain_page(pt);
+
+    return found;
+}
+
+int shadow_remove_all_write_access(
+    struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
+{
+    int i;
+    struct shadow_status *a;
+    u32 found = 0, fixups, write_refs;
+    unsigned long prediction, predicted_gpfn, predicted_smfn;
+
+    ASSERT(shadow_lock_is_acquired(d));
+    ASSERT(VALID_MFN(readonly_gmfn));
+
+    perfc_incrc(remove_write_access);
+
+    // If it's not a writable page, then no writable refs can be outstanding.
+    //
+    if ( (frame_table[readonly_gmfn].u.inuse.type_info & PGT_type_mask) !=
+         PGT_writable_page )
+    {
+        perfc_incrc(remove_write_not_writable);
+        return 1;
+    }
+
+    // How many outstanding writable PTEs for this page are there?
+    //
+    write_refs =
+        (frame_table[readonly_gmfn].u.inuse.type_info & PGT_count_mask);
+    if ( write_refs && MFN_PINNED(readonly_gmfn) )
+    {
+        write_refs--;
+    }
+
+    if ( write_refs == 0 )
+    {
+        perfc_incrc(remove_write_no_work);
+        return 1;
+    }
+
+    // Before searching all the L1 page tables, check the typical culprit first
+    //
+    if ( (prediction = predict_writable_pte_page(d, readonly_gpfn)) )
+    {
+        predicted_gpfn = prediction & PGT_mfn_mask;
+        if ( (predicted_smfn =
+              __shadow_status(d, predicted_gpfn, PGT_l1_shadow)) &&
+             (fixups = remove_all_write_access_in_ptpage(
+                 d, predicted_gpfn, predicted_smfn, readonly_gpfn,
+                 readonly_gmfn, write_refs, prediction)) )
+        {
+            found += fixups;
+            if ( found == write_refs )
+            {
+                perfc_incrc(remove_write_predicted);
+                return 1;
+            }
+        }
+        else
+        {
+            perfc_incrc(remove_write_bad_prediction);
+            decrease_writable_pte_prediction(d, readonly_gpfn, prediction);
+        }
+    }
+
+    // Search all the shadow L1 page tables...
+    //
+    for (i = 0; i < shadow_ht_buckets; i++)
+    {
+        a = &d->arch.shadow_ht[i];
+        while ( a && a->gpfn_and_flags )
+        {
+            if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow )
+            {
+                found += remove_all_write_access_in_ptpage(
+                    d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn,
+                    readonly_gpfn, readonly_gmfn, write_refs - found,
+                    a->gpfn_and_flags & PGT_mfn_mask);
+                if ( found == write_refs )
+                    return 1;
+            }
+
+            a = a->next;
+        }
+    }
+
+    FSH_LOG("%s: looking for %d refs, found %d refs",
+            __func__, write_refs, found);
+
+    return 0;
+}
+
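+// Zap every present mapping of forbidden_gmfn in one shadow page,
+// dropping the reference each entry holds (a shadow ref for L1
+// shadows, a plain page ref for hl2 pages).  Returns entries cleared.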
+static u32 remove_all_access_in_page(
+    struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
+{
+    l1_pgentry_t *pl1e = map_domain_page(l1mfn);
+    l1_pgentry_t match;
+    unsigned long flags  = _PAGE_PRESENT;
+    int i;
+    u32 count = 0;
+    int is_l1_shadow =
+        ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) ==
+         PGT_l1_shadow);
+
+    match = l1e_from_pfn(forbidden_gmfn, flags);
+    
+    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
+    {
+        if ( unlikely(!l1e_has_changed(pl1e[i], match, flags)) )
+        {
+            l1_pgentry_t ol1e = pl1e[i];
+            pl1e[i] = l1e_empty();
+            count++;
+
+            if ( is_l1_shadow )
+                shadow_put_page_from_l1e(ol1e, d);
+            else /* must be an hl2 page */
+                put_page(&frame_table[forbidden_gmfn]);
+        }
+    }
+
+    unmap_domain_page(pl1e);
+
+    return count;
+}
+
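+// Remove every mapping of forbidden_gmfn from all of the domain's
+// shadow pages.  Caller must hold the shadow lock.  Returns the total
+// number of entries cleared.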
+u32 shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
+{
+    int i;
+    struct shadow_status *a;
+    u32 count = 0;
+
+    if ( unlikely(!shadow_mode_enabled(d)) )
+        return 0;
+
+    ASSERT(shadow_lock_is_acquired(d));
+    perfc_incrc(remove_all_access);
+
+    for (i = 0; i < shadow_ht_buckets; i++)
+    {
+        a = &d->arch.shadow_ht[i];
+        while ( a && a->gpfn_and_flags )
+        {
+            switch (a->gpfn_and_flags & PGT_type_mask)
+            {
+            case PGT_l1_shadow:
+            case PGT_l2_shadow:
+            case PGT_l3_shadow:
+            case PGT_l4_shadow:
+            case PGT_hl2_shadow:
+                count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
+                break;
+            case PGT_snapshot:
+            case PGT_writable_pred:
+                // these can't hold refs to the forbidden page
+                break;
+            default:
+                BUG();
+            }
+
+            a = a->next;
+        }
+    }
+
+    return count;
+}
+
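+// Bring all out-of-sync pages of the given shadow type back in sync:
+// diff each guest page against its snapshot and propagate changed
+// entries into the shadow.  Returns nonzero if the TLBs need flushing.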
+static int resync_all(struct domain *d, u32 stype)
+{
+    struct out_of_sync_entry *entry;
+    unsigned i;
+    unsigned long smfn;
+    void *guest, *shadow, *snapshot;
+    int need_flush = 0, external = shadow_mode_external(d);
+    int unshadow;
+    int changed;
+
+    ASSERT(shadow_lock_is_acquired(d));
+
+    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
+    {
+        if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
+            continue;
+
+        smfn = __shadow_status(d, entry->gpfn, stype);
+
+        if ( !smfn )
+        {
+            if ( shadow_mode_refcounts(d) )
+                continue;
+
+            // For light weight shadows, even when no shadow page exists,
+            // we need to resync the refcounts to the new contents of the
+            // guest page.
+            // This only applies when we have writable page tables.
+            //
+            if ( !shadow_mode_write_all(d) &&
+                 !((stype == PGT_l1_shadow) &&
+                   VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
+                // Page is not writable -- no resync necessary
+                continue;
+        }
+
+        FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
+                stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
+
+        // Compare guest's new contents to its snapshot, validating
+        // and updating its shadow as appropriate.
+        //
+        guest    = map_domain_page(entry->gmfn);
+        snapshot = map_domain_page(entry->snapshot_mfn);
+
+        if ( smfn )
+            shadow = map_domain_page(smfn);
+        else
+            shadow = NULL;
+
+        unshadow = 0;
+
+        switch ( stype ) {
+        case PGT_l1_shadow:
+        {
+            l1_pgentry_t *guest1 = guest;
+            l1_pgentry_t *shadow1 = shadow;
+            l1_pgentry_t *snapshot1 = snapshot;
+
+            ASSERT(VM_ASSIST(d, VMASST_TYPE_writable_pagetables) ||
+                   shadow_mode_write_all(d));
+
+            if ( !shadow_mode_refcounts(d) )
+                revalidate_l1(d, guest1, snapshot1);
+
+            if ( !smfn )
+                break;
+
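+            // The tlbflush_timestamp fields of the shadow and snapshot
+            // pages are (ab)used to hold packed min/max indices of the
+            // interesting entries, so only that window needs diffing
+            // rather than all L1_PAGETABLE_ENTRIES slots.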
+            u32 min_max_shadow = pfn_to_page(smfn)->tlbflush_timestamp;
+            int min_shadow = SHADOW_MIN(min_max_shadow);
+            int max_shadow = SHADOW_MAX(min_max_shadow);
+
+            u32 min_max_snapshot =
+                pfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
+            int min_snapshot = SHADOW_MIN(min_max_snapshot);
+            int max_snapshot = SHADOW_MAX(min_max_snapshot);
+
+            changed = 0;
+
+            for ( i = min_shadow; i <= max_shadow; i++ )
+            {
+                if ( (i < min_snapshot) || (i > max_snapshot) ||
+                     l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) )
+                {
+                    need_flush |= validate_pte_change(d, guest1[i], &shadow1[i]);
+
+                    // can't update snapshots of linear page tables -- they
+                    // are used multiple times...
+                    //
+                    // snapshot[i] = new_pte;
+
+                    changed++;
+                }
+            }
+            perfc_incrc(resync_l1);
+            perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
+            perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1,
+                             PT_UPDATES);
+            break;
+        }
+        case PGT_l2_shadow:
+        {
+            int max = -1;
+
+            l2_pgentry_t *guest2 = guest;
+            l2_pgentry_t *shadow2 = shadow;
+            l2_pgentry_t *snapshot2 = snapshot;
+
+            ASSERT(shadow_mode_write_all(d));
+            BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
+
+            changed = 0;
+            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+            {
+#if CONFIG_X86_PAE
+                BUG();  /* FIXME: need type_info */
+#endif
+                if ( !is_guest_l2_slot(0,i) && !external )
+                    continue;
+
+                l2_pgentry_t new_pde = guest2[i];
+                if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK))
+                {
+                    need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
+
+                    // can't update snapshots of linear page tables -- they
+                    // are used multiple times...
+                    //
+                    // snapshot[i] = new_pde;
+
+                    changed++;
+                }
+                if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */
+                    max = i;
+
+                // XXX - This hack works for linux guests.
+                //       Need a better solution long term.
+                if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
+                     unlikely(l2e_get_intpte(new_pde) != 0) &&
+                     !unshadow && MFN_PINNED(smfn) )
+                    unshadow = 1;
+            }
+            if ( max == -1 )
+                unshadow = 1;
+            perfc_incrc(resync_l2);
+            perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
+            break;
+        }
+        case PGT_hl2_shadow:
+        {
+            l2_pgentry_t *guest2 = guest;
+            l2_pgentry_t *snapshot2 = snapshot;
+            l1_pgentry_t *shadow2 = shadow;
+            
+            ASSERT(shadow_mode_write_all(d));
+            BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
+
+            changed = 0;
+            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+            {
+#if CONFIG_X86_PAE
+                BUG();  /* FIXME: need type_info */
+#endif
+                if ( !is_guest_l2_slot(0, i) && !external )
+                    continue;
+
+                l2_pgentry_t new_pde = guest2[i];
+                if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) )
+                {
+                    need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
+
+                    // can't update snapshots of linear page tables -- they
+                    // are used multiple times...
+                    //
+                    // snapshot[i] = new_pde;
+
+                    changed++;
+                }
+            }
+            perfc_incrc(resync_hl2);
+            perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
+            break;
+        }
+        default:
+            BUG();
+        }
+
+        if ( smfn )
+            unmap_domain_page(shadow);
+        unmap_domain_page(snapshot);
+        unmap_domain_page(guest);
+
+        if ( unlikely(unshadow) )
+        {
+            perfc_incrc(unshadow_l2_count);
+            shadow_unpin(smfn);
+            if ( unlikely(shadow_mode_external(d)) )
+            {
+                unsigned long hl2mfn;
+
+                if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
+                     MFN_PINNED(hl2mfn) )
+                    shadow_unpin(hl2mfn);
+            }
+        }
+    }
+
+    return need_flush;
+}
+
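+// Flush the whole out-of-sync list: first revoke write access to all
+// out-of-sync page tables, then resync them (L1s, then hl2s, then
+// L2s), flush TLBs as needed, and free the out-of-sync state.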
+void __shadow_sync_all(struct domain *d)
+{
+    struct out_of_sync_entry *entry;
+    int need_flush = 0;
+
+    perfc_incrc(shadow_sync_all);
+
+    ASSERT(shadow_lock_is_acquired(d));
+
+    // First, remove all write permissions to the page tables
+    //
+    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
+    {
+        // Skip entries that have low bits set...  Those aren't
+        // real PTEs.
+        //
+        if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
+            continue;
+
+        l1_pgentry_t *ppte = (l1_pgentry_t *)(
+            (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) +
+            (entry->writable_pl1e & ~PAGE_MASK));
+        l1_pgentry_t opte = *ppte;
+        l1_pgentry_t npte = opte;
+        l1e_remove_flags(npte, _PAGE_RW);
+
+        if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
+             !shadow_get_page_from_l1e(npte, d) )
+            BUG();
+        *ppte = npte;
+        shadow_put_page_from_l1e(opte, d);
+
+        unmap_domain_page(ppte);
+    }
+
+    // XXX mafetter: SMP
+    //
+    // With the current algorithm, we've gotta flush all the TLBs
+    // before we can safely continue.  I don't think we want to
+    // do it this way, so I think we should consider making
+    // entirely private copies of the shadow for each vcpu, and/or
+    // possibly having a mix of private and shared shadow state
+    // (any path from a PTE that grants write access to an out-of-sync
+    // page table page needs to be vcpu private).
+    //
+#if 0 // this should be enabled for SMP guests...
+    flush_tlb_mask(cpu_online_map);
+#endif
+    need_flush = 1;
+
+    // Second, resync all L1 pages, then L2 pages, etc...
+    //
+    need_flush |= resync_all(d, PGT_l1_shadow);
+    if ( shadow_mode_translate(d) )
+        need_flush |= resync_all(d, PGT_hl2_shadow);
+    need_flush |= resync_all(d, PGT_l2_shadow);
+
+    if ( need_flush && likely(!shadow_mode_external(d)) )
+        local_flush_tlb();
+
+    free_out_of_sync_state(d);
+}
+
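+// Top-level shadow page-fault handler.  Returns EXCRET_fault_fixed if
+// the fault was caused by a stale shadow entry and has been repaired
+// here; returns 0 so the caller can forward the fault to the guest.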
+int shadow_fault(unsigned long va, struct cpu_user_regs *regs)
+{
+    l1_pgentry_t gpte, spte, orig_gpte;
+    struct vcpu *v = current;
+    struct domain *d = v->domain;
+    l2_pgentry_t gpde;
+
+    spte = l1e_empty();
+
+    SH_VVLOG("shadow_fault( va=%lx, code=%lu )",
+             va, (unsigned long)regs->error_code);
+    perfc_incrc(shadow_fault_calls);
+    
+    check_pagetable(v, "pre-sf");
+
+    /*
+     * Don't let someone else take the guest's table pages out-of-sync.
+     */
+    shadow_lock(d);
+
+    /* XXX - FIX THIS COMMENT!!!
+     * STEP 1. Check to see if this fault might have been caused by an
+     *         out-of-sync table page entry, or if we should pass this
+     *         fault onto the guest.
+     */
+    __shadow_sync_va(v, va);
+
+    /*
+     * STEP 2. Check the guest PTE.
+     */
+    __guest_get_l2e(v, va, &gpde);
+    if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
+    {
+        SH_VVLOG("shadow_fault - EXIT: L1 not present");
+        perfc_incrc(shadow_fault_bail_pde_not_present);
+        goto fail;
+    }
+
+    // This can't fault because we hold the shadow lock and we've ensured that
+    // the mapping is in-sync, so the check of the PDE's present bit, above,
+    // covers this access.
+    //
+    orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
+    if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
+    {
+        SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ")",
+                 l1e_get_intpte(gpte));
+        perfc_incrc(shadow_fault_bail_pte_not_present);
+        goto fail;
+    }
+
+    /* Write fault? (bit 1 of the hardware error code is set for writes) */
+    if ( regs->error_code & 2 )
+    {
+        int allow_writes = 0;
+
+        if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
+        {
+            if ( shadow_mode_page_writable(d, l1e_get_pfn(gpte)) )
+            {
+                allow_writes = 1;
+                l1e_add_flags(gpte, _PAGE_RW);
+            }
+            else
+            {
+                /* Write fault on a read-only mapping. */
+                SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte 
")", 
+                         l1e_get_intpte(gpte));
+                perfc_incrc(shadow_fault_bail_ro_mapping);
+                goto fail;
+            }
+        }
+
+        if ( !l1pte_write_fault(v, &gpte, &spte, va) )
+        {
+            SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
+            perfc_incrc(write_fault_bail);
+            shadow_unlock(d);
+            return 0;
+        }
+
+        if ( allow_writes )
+            l1e_remove_flags(gpte, _PAGE_RW);
+    }
+    else
+    {
+        if ( !l1pte_read_fault(d, &gpte, &spte) )
+        {
+            SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
+            perfc_incrc(read_fault_bail);
+            shadow_unlock(d);
+            return 0;
+        }
+    }
+
+    /*
+     * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
+     */
+    if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) )
+    {
+        /* XXX Watch out for read-only L2 entries! (not used in Linux). */
+        if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
+                                     &gpte, sizeof(gpte))) )
+        {
+            printk("%s() failed, crashing domain %d "
+                   "due to a read-only L2 page table (gpde=%" PRIpte "), 
va=%lx\n",
+                   __func__,d->domain_id, l2e_get_intpte(gpde), va);
+            domain_crash_synchronous();
+        }
+
+        // if necessary, record the page table page as dirty
+        if ( unlikely(shadow_mode_log_dirty(d)) )
+            __mark_dirty(d, __gpfn_to_mfn(d, l2e_get_pfn(gpde)));
+    }
+
+    shadow_set_l1e(va, spte, 1);
+
+    perfc_incrc(shadow_fault_fixed);
+    d->arch.shadow_fault_count++;
+
+    shadow_unlock(d);
+
+    check_pagetable(v, "post-sf");
+    return EXCRET_fault_fixed;
+
+ fail:
+    shadow_unlock(d);
+    return 0;
+}
+
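+// The guest has updated an L1 entry through an explicit (validated)
+// page-table update rather than a trapped write; propagate the new
+// gpte into the corresponding shadow L1 entry, if one exists.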
+void shadow_l1_normal_pt_update(
+    struct domain *d,
+    unsigned long pa, l1_pgentry_t gpte,
+    struct domain_mmap_cache *cache)
+{
+    unsigned long sl1mfn;    
+    l1_pgentry_t *spl1e, spte;
+
+    shadow_lock(d);
+
+    sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow);
+    if ( sl1mfn )
+    {
+        SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%" PRIpte,
+                 (void *)pa, l1e_get_intpte(gpte));
+        l1pte_propagate_from_guest(current->domain, gpte, &spte);
+
+        spl1e = map_domain_page_with_cache(sl1mfn, cache);
+        spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte;
+        unmap_domain_page_with_cache(spl1e, cache);
+    }
+
+    shadow_unlock(d);
+}
+
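+// As above, but for explicit guest L2 (PDE) updates: revalidate the
+// corresponding shadow L2 entry if this page is shadowed.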
+void shadow_l2_normal_pt_update(
+    struct domain *d,
+    unsigned long pa, l2_pgentry_t gpde,
+    struct domain_mmap_cache *cache)
+{
+    unsigned long sl2mfn;
+    l2_pgentry_t *spl2e;
+
+    shadow_lock(d);
+
+    sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow);
+    if ( sl2mfn )
+    {
+        SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte,
+                 (void *)pa, l2e_get_intpte(gpde));
+        spl2e = map_domain_page_with_cache(sl2mfn, cache);
+        validate_pde_change(d, gpde,
+                            &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]);
+        unmap_domain_page_with_cache(spl2e, cache);
+    }
+
+    shadow_unlock(d);
+}
+
+#if CONFIG_PAGING_LEVELS >= 3
+void shadow_l3_normal_pt_update(
+    struct domain *d,
+    unsigned long pa, l3_pgentry_t gpde,
+    struct domain_mmap_cache *cache)
+{
+    BUG(); // not yet implemented
+}
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+void shadow_l4_normal_pt_update(
+    struct domain *d,
+    unsigned long pa, l4_pgentry_t gpde,
+    struct domain_mmap_cache *cache)
+{
+    BUG(); // not yet implemented
+}
+#endif
+
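+// Shadow-mode path for update_va_mapping: sync whatever is needed to
+// reach va, propagate the new PTE into the shadow, and note the dirty
+// page if log-dirty mode is active.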
+int shadow_do_update_va_mapping(unsigned long va,
+                                l1_pgentry_t val,
+                                struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    l1_pgentry_t spte;
+    int rc = 0;
+
+    shadow_lock(d);
+
+    //printk("%s(va=%p, val=%p)\n", __func__, (void *)va, (void 
*)l1e_get_intpte(val));
+        
+    // This is actually overkill - we don't need to sync the L1 itself,
+    // just everything involved in getting to this L1 (i.e. we need
+    // linear_pg_table[l1_linear_offset(va)] to be in sync)...
+    //
+    __shadow_sync_va(v, va);
+
+    l1pte_propagate_from_guest(d, val, &spte);
+    shadow_set_l1e(va, spte, 0);
+
+    /*
+     * If we're in log-dirty mode then we need to note that we've updated
+     * the PTE in the PT-holding page. We need the machine frame number
+     * for this.
+     */
+    if ( shadow_mode_log_dirty(d) )
+        __mark_dirty(d, va_to_l1mfn(v, va));
+
+// out:
+    shadow_unlock(d);
+
+    return rc;
+}
+
+
+/*
+ * What lives where in the 32-bit address space in the various shadow modes,
+ * and what it uses to get/maintain that mapping.
+ *
+ * SHADOW MODE:      none         enable         translate         external
+ * 
+ * 4KB things:
+ * guest_vtable    lin_l2     mapped per gl2   lin_l2 via hl2   mapped per gl2
+ * shadow_vtable     n/a         sh_lin_l2       sh_lin_l2      mapped per gl2
+ * hl2_vtable        n/a            n/a        lin_hl2 via hl2  mapped per gl2
+ * monitor_vtable    n/a            n/a             n/a           mapped once
+ *
+ * 4MB things:
+ * guest_linear  lin via gl2    lin via gl2      lin via hl2      lin via hl2
+ * shadow_linear     n/a      sh_lin via sl2   sh_lin via sl2   sh_lin via sl2
+ * monitor_linear    n/a            n/a             n/a              ???
+ * perdomain      perdomain      perdomain       perdomain        perdomain
+ * R/O M2P         R/O M2P        R/O M2P           n/a              n/a
+ * R/W M2P         R/W M2P        R/W M2P         R/W M2P          R/W M2P
+ * P2M               n/a            n/a           R/O M2P          R/O M2P
+ *
+ * NB:
+ * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
+ * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
+ * all play a part in maintaining these mappings.
+ */
+void __update_pagetables(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table);
+    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
+    unsigned long smfn, hl2mfn, old_smfn;
+
+    int max_mode = ( shadow_mode_external(d) ? SHM_external
+                     : shadow_mode_translate(d) ? SHM_translate
+                     : shadow_mode_enabled(d) ? SHM_enable
+                     : 0 );
+
+    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
+    ASSERT( max_mode );
+
+    /*
+     *  arch.guest_vtable
+     */
+    if ( max_mode & (SHM_enable | SHM_external) )
+    {
+        if ( likely(v->arch.guest_vtable != NULL) )
+            unmap_domain_page(v->arch.guest_vtable);
+        v->arch.guest_vtable = map_domain_page(gmfn);
+    }
+
+    /*
+     *  arch.shadow_table
+     */
+    if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
+        smfn = shadow_l2_table(d, gpfn, gmfn);
+    if ( !get_shadow_ref(smfn) )
+        BUG();
+    old_smfn = pagetable_get_pfn(v->arch.shadow_table);
+    v->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
+    if ( old_smfn )
+        put_shadow_ref(old_smfn);
+
+    SH_VVLOG("__update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn);
+
+    /*
+     * arch.shadow_vtable
+     */
+    if ( max_mode == SHM_external )
+    {
+        if ( v->arch.shadow_vtable )
+            unmap_domain_page(v->arch.shadow_vtable);
+        v->arch.shadow_vtable = map_domain_page(smfn);
+    }
+
+    /*
+     * arch.hl2_vtable
+     */
+
+    // if max_mode == SHM_translate, then the hl2 is already installed
+    // correctly in its smfn, and there's nothing to do.
+    //
+    if ( max_mode == SHM_external )
+    {
+        if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
+            hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
+        if ( v->arch.hl2_vtable )
+            unmap_domain_page(v->arch.hl2_vtable);
+        v->arch.hl2_vtable = map_domain_page(hl2mfn);
+    }
+
+    /*
+     * fixup pointers in monitor table, as necessary
+     */
+    if ( max_mode == SHM_external )
+    {
+        l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
+        l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
+        l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
+
+        ASSERT( shadow_mode_translate(d) );
+
+        if ( !get_shadow_ref(hl2mfn) )
+            BUG();
+        mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
+            l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
+        if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
+            put_shadow_ref(l2e_get_pfn(old_hl2e));
+
+        if ( !get_shadow_ref(smfn) )
+            BUG();
+        mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+            l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
+        if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
+            put_shadow_ref(l2e_get_pfn(old_sl2e));
+
+        // XXX - maybe this can be optimized somewhat??
+        local_flush_tlb();
+    }
+}
+
+
+/************************************************************************/
+/************************************************************************/
+/************************************************************************/
+
+#if SHADOW_DEBUG
+
+// The following is entirely for _check_pagetable()'s benefit.
+// _check_pagetable() wants to know whether a given entry in a
+// shadow page table is supposed to be the shadow of the guest's
+// current entry, or the shadow of the entry held in the snapshot
+// taken above.
+//
+// Here, we mark all currently existing entries as reflecting
+// the snapshot, above.  All other places in xen that update
+// the shadow will keep the shadow in sync with the guest's
+// entries (via l1pte_propagate_from_guest and friends), which clear
+// the SHADOW_REFLECTS_SNAPSHOT bit.
+//
+static void
+mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
+{
+    unsigned long smfn;
+    l1_pgentry_t *l1e;
+    l2_pgentry_t *l2e;
+    unsigned i;
+
+    if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
+    {
+        l1e = map_domain_page(smfn);
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+            if ( is_guest_l1_slot(i) &&
+                 (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
+                l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT);
+        unmap_domain_page(l1e);
+    }
+
+    if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
+    {
+        l2e = map_domain_page(smfn);
+        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+            if ( is_guest_l2_slot(0, i) &&
+                 (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
+                l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT);
+        unmap_domain_page(l2e);
+    }
+}
+
+// BUG: these are not SMP safe...
+static int sh_l2_present;
+static int sh_l1_present;
+char *sh_check_name;
+int shadow_status_noswap;
+
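+// v2m(): translate a virtual address to a machine address by walking
+// the shadow linear page tables (debug helper).  Evaluates to an
+// all-ones value when the mapping is absent.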
+#define v2m(_v, _adr) ({                                                     \
+    unsigned long _a  = (unsigned long)(_adr);                               \
+    l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)];     \
+    unsigned long _pa = -1;                                                  \
+    if ( l2e_get_flags(_pde) & _PAGE_PRESENT )                               \
+    {                                                                        \
+        l1_pgentry_t _pte;                                                   \
+        _pte = shadow_linear_pg_table[l1_linear_offset(_a)];                 \
+        if ( l1e_get_flags(_pte) & _PAGE_PRESENT )                           \
+            _pa = l1e_get_paddr(_pte);                                       \
+    }                                                                        \
+    _pa | (_a & ~PAGE_MASK);                                                 \
+})
+
+#define FAIL(_f, _a...)                                                      \
+    do {                                                                     \
+        printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n",                  \
+               sh_check_name, level, l2_idx, l1_idx, ## _a,                  \
+               __FILE__, __LINE__);                                          \
+        printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte                \
+               " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte               \
+               " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p"               \
+               " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n",                   \
+               l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte),     \
+               l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte),     \
+               p_guest_pte, p_shadow_pte, p_snapshot_pte,                    \
+               (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte),    \
+               (void *)v2m(v, p_snapshot_pte),                               \
+               (l2_idx << L2_PAGETABLE_SHIFT) |                              \
+               (l1_idx << L1_PAGETABLE_SHIFT));                              \
+        errors++;                                                            \
+    } while ( 0 )
+
+static int check_pte(
+    struct vcpu *v,
+    l1_pgentry_t *p_guest_pte,
+    l1_pgentry_t *p_shadow_pte,
+    l1_pgentry_t *p_snapshot_pte,
+    int level, int l2_idx, int l1_idx)
+{
+    struct domain *d = v->domain;
+    l1_pgentry_t guest_pte = *p_guest_pte;
+    l1_pgentry_t shadow_pte = *p_shadow_pte;
+    l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
+    l1_pgentry_t eff_guest_pte;
+    unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
+    int errors = 0, guest_writable;
+    int page_table_page;
+
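+    // 0xdeadface and 0x00000E00 appear to be poison/sentinel values
+    // planted in unused shadow entries elsewhere in this code; treat
+    // them, like zero, as trivially safe.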
+    if ( (l1e_get_intpte(shadow_pte) == 0) ||
+         (l1e_get_intpte(shadow_pte) == 0xdeadface) ||
+         (l1e_get_intpte(shadow_pte) == 0x00000E00) )
+        return errors;  /* always safe */
+
+    if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
+        FAIL("Non zero not present shadow_pte");
+
+    if ( level == 2 ) sh_l2_present++;
+    if ( level == 1 ) sh_l1_present++;
+
+    if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && 
p_snapshot_pte )
+        eff_guest_pte = snapshot_pte;
+    else
+        eff_guest_pte = guest_pte;
+
+    if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
+        FAIL("Guest not present yet shadow is");
+
+    mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
+
+    if ( (l1e_get_intpte(shadow_pte) & mask) !=
+         (l1e_get_intpte(eff_guest_pte) & mask) )
+        FAIL("Corrupt?");
+
+    if ( (level == 1) &&
+         (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
+         !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
+        FAIL("Dirty coherence");
+
+    if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
+         !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
+        FAIL("Accessed coherence");
+
+    if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
+        FAIL("global bit set in shadow");
+
+    eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
+    eff_guest_mfn = __gpfn_to_mfn(d, eff_guest_pfn);
+    shadow_mfn = l1e_get_pfn(shadow_pte);
+
+    if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
+        FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n",
+             __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte));
+
+    page_table_page = mfn_is_page_table(eff_guest_mfn);
+
+    guest_writable =
+        (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
+        (VM_ASSIST(d, VMASST_TYPE_writable_pagetables) && (level == 1) &&
+         mfn_out_of_sync(eff_guest_mfn));
+
+    if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
+    {
+        printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08x 
page_table_page=%d\n",
+               eff_guest_pfn, eff_guest_mfn, shadow_mfn,
+               frame_table[eff_guest_mfn].u.inuse.type_info,
+               page_table_page);
+        FAIL("RW coherence");
+    }
+
+    if ( (level == 1) &&
+         (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
+         !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
+    {
+        printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08x 
page_table_page=%d\n",
+               eff_guest_pfn, eff_guest_mfn, shadow_mfn,
+               frame_table[eff_guest_mfn].u.inuse.type_info,
+               page_table_page);
+        FAIL("RW2 coherence");
+    }
+ 
+    if ( eff_guest_mfn == shadow_mfn )
+    {
+        if ( level > 1 )
+            FAIL("Linear map ???");    /* XXX this will fail on BSD */
+    }
+    else
+    {
+        if ( level < 2 )
+            FAIL("Shadow in L1 entry?");
+
+        if ( level == 2 )
+        {
+            if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) !=
+                 shadow_mfn )
+                FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx",
+                     eff_guest_pfn,
+                     __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
+        }
+        else
+            BUG(); // XXX -- not handled yet.
+    }
+
+    return errors;
+}
+#undef FAIL
+#undef v2m
+
+static int check_l1_table(
+    struct vcpu *v, unsigned long gpfn,
+    unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
+{
+    struct domain *d = v->domain;
+    int i;
+    unsigned long snapshot_mfn;
+    l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
+    int errors = 0;
+
+    if ( page_out_of_sync(pfn_to_page(gmfn)) )
+    {
+        snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
+        ASSERT(snapshot_mfn);
+        p_snapshot = map_domain_page(snapshot_mfn);
+    }
+
+    p_guest  = map_domain_page(gmfn);
+    p_shadow = map_domain_page(smfn);
+
+    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        errors += check_pte(v, p_guest+i, p_shadow+i,
+                            p_snapshot ? p_snapshot+i : NULL,
+                            1, l2_idx, i);
+ 
+    unmap_domain_page(p_shadow);
+    unmap_domain_page(p_guest);
+    if ( p_snapshot )
+        unmap_domain_page(p_snapshot);
+
+    return errors;
+}
+
+#define FAILPT(_f, _a...)                                         \
+    do {                                                          \
+        printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
+        errors++;                                                 \
+    } while ( 0 )
+
+int check_l2_table(
+    struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes)
+{
+    struct domain *d = v->domain;
+    l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn);
+    l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn);
+    l2_pgentry_t match;
+    int i;
+    int errors = 0;
+    int limit;
+
+    if ( !oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != d) )
+        FAILPT("domain doesn't own page");
+    if ( oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != NULL) )
+        FAILPT("bogus owner for snapshot page");
+    if ( page_get_owner(pfn_to_page(smfn)) != NULL )
+        FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d",
+               smfn, page_get_owner(pfn_to_page(smfn))->domain_id);
+
+#if 0
+    if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+                &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
+                ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
+                 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
+    {
+        for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE; 
+              i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
+              i++ )
+            printk("+++ (%d) %lx %lx\n",i,
+                   l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
+        FAILPT("hypervisor entries inconsistent");
+    }
+
+    if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
+          l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
+        FAILPT("hypervisor linear map inconsistent");
+#endif
+
+    match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
+    if ( !shadow_mode_external(d) &&
+         l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
+                         match, PAGE_FLAG_MASK))
+    {
+        FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" 
PRIpte,
+               l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >>
+                                   L2_PAGETABLE_SHIFT]),
+               l2e_get_intpte(match));
+    }
+
+    match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
+    if ( !shadow_mode_external(d) &&
+         l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
+                         match, PAGE_FLAG_MASK))
+    {
+        FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", 
expected (va=%p) %" PRIpte,
+               l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> 
L2_PAGETABLE_SHIFT]),
+               d->arch.mm_perdomain_pt,
+               l2e_get_intpte(match));
+    }
+
+#ifdef __i386__
+    if ( shadow_mode_external(d) )
+        limit = L2_PAGETABLE_ENTRIES;
+    else
+        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
+#else
+    limit = 0; /* XXX x86/64 XXX */
+#endif
+
+    /* Check the whole L2. */
+    for ( i = 0; i < limit; i++ )
+        errors += check_pte(v,
+                            (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
+                            (l1_pgentry_t*)(&spl2e[i]),
+                            NULL,
+                            2, i, 0);
+
+    unmap_domain_page(spl2e);
+    unmap_domain_page(gpl2e);
+
+#if 1
+    if ( errors )
+        printk("check_l2_table returning %d errors\n", errors);
+#endif
+
+    return errors;
+}
+#undef FAILPT
+
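+// Audit one guest page table: check the top-level L2 against its
+// shadow, then recurse into every shadowed L1.  BUG()s if any
+// inconsistency is found.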
+int _check_pagetable(struct vcpu *v, char *s)
+{
+    struct domain *d = v->domain;
+    pagetable_t pt = v->arch.guest_table;
+    unsigned long gptbase = pagetable_get_paddr(pt);
+    unsigned long ptbase_pfn, smfn;
+    unsigned long i;
+    l2_pgentry_t *gpl2e, *spl2e;
+    unsigned long ptbase_mfn = 0;
+    int errors = 0, limit, oos_pdes = 0;
+
+    //_audit_domain(d, AUDIT_QUIET);
+    shadow_lock(d);
+
+    sh_check_name = s;
+    //SH_VVLOG("%s-PT Audit", s);
+    sh_l2_present = sh_l1_present = 0;
+    perfc_incrc(check_pagetable);
+
+    ptbase_mfn = gptbase >> PAGE_SHIFT;
+    ptbase_pfn = __mfn_to_gpfn(d, ptbase_mfn);
+
+    if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
+    {
+        printk("%s-PT %lx not shadowed\n", s, gptbase);
+        goto out;
+    }
+    if ( page_out_of_sync(pfn_to_page(ptbase_mfn)) )
+    {
+        ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
+        oos_pdes = 1;
+        ASSERT(ptbase_mfn);
+    }
+ 
+    errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes);
+
+    gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn);
+    spl2e = (l2_pgentry_t *) map_domain_page(smfn);
+
+    /* Go back and recurse. */
+#ifdef __i386__
+    if ( shadow_mode_external(d) )
+        limit = L2_PAGETABLE_ENTRIES;
+    else
+        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
+#else
+    limit = 0; /* XXX x86/64 XXX */
+#endif
+
+    for ( i = 0; i < limit; i++ )
+    {
+        unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
+        unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn);
+        unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
+
+        if ( l2e_get_intpte(spl2e[i]) != 0 )  /* FIXME: check flags? */
+        {
+            errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i);
+        }
+    }
+
+    unmap_domain_page(spl2e);
+    unmap_domain_page(gpl2e);
+
+#if 0
+    SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
+             sh_l2_present, sh_l1_present);
+#endif
+
+ out:
+    if ( errors )
+        BUG();
+
+    shadow_unlock(d);
+
+    return errors;
+}
+
+int _check_all_pagetables(struct vcpu *v, char *s)
+{
+    struct domain *d = v->domain;
+    int i;
+    struct shadow_status *a;
+    unsigned long gmfn;
+    int errors = 0;
+
+    shadow_status_noswap = 1;
+
+    sh_check_name = s;
+    SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id);
+    sh_l2_present = sh_l1_present = 0;
+    perfc_incrc(check_all_pagetables);
+
+    for (i = 0; i < shadow_ht_buckets; i++)
+    {
+        a = &d->arch.shadow_ht[i];
+        while ( a && a->gpfn_and_flags )
+        {
+            gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
+
+            switch ( a->gpfn_and_flags & PGT_type_mask )
+            {
+            case PGT_l1_shadow:
+                errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask,
+                                         gmfn, a->smfn, 0);
+                break;
+            case PGT_l2_shadow:
+                errors += check_l2_table(v, gmfn, a->smfn,
+                                         page_out_of_sync(pfn_to_page(gmfn)));
+                break;
+            case PGT_l3_shadow:
+            case PGT_l4_shadow:
+            case PGT_hl2_shadow:
+                BUG(); // XXX - ought to fix this...
+                break;
+            case PGT_snapshot:
+            case PGT_writable_pred:
+                break;
+            default:
+                errors++;
+                printk("unexpected shadow type %lx, gpfn=%lx, "
+                       "gmfn=%lx smfn=%lx\n",
+                       a->gpfn_and_flags & PGT_type_mask,
+                       a->gpfn_and_flags & PGT_mfn_mask,
+                       gmfn, a->smfn);
+                BUG();
+            }
+            a = a->next;
+        }
+    }
+
+    shadow_status_noswap = 0;
+
+    if ( errors )
+        BUG();
+
+    return errors;
+}
+
+#endif // SHADOW_DEBUG
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-changelog] Check in files I missed from shadow64 checkin., Xen patchbot -unstable <=