WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
To: <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH] x86/mm/p2m: break into common, pt-implementation and pod parts
From: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Date: Fri, 6 May 2011 11:34:35 +0100
Delivery-date: Fri, 06 May 2011 03:35:43 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mercurial-patchbomb/1.6.4
# HG changeset patch
# User Tim Deegan <Tim.Deegan@xxxxxxxxxx>
# Date 1304676935 -3600
# Node ID 26c4beb6b520733883eb1fb2aac8701de9188e08
# Parent  4b0692880dfa557d4e1537c7a58c412c1286a416
x86/mm/p2m: break into common, pt-implementation and pod parts.

Start to make a clearer distinction between generic p2m functions and
the implementation of the data structure as an x86 pagetable.
Also move the EPT data structure implementation into x86/mm/ to match,
and split the PoD admin code into its own file.

This is just code motion, except for splitting the p2m_initialise
function into a pt-specific part and a common part.

Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
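
The p2m.c and p2m-pt.c hunks that perform the p2m_initialise split are
not shown in this excerpt.  As a rough sketch only -- the helper name
p2m_pt_init and the exact field assignments below are illustrative
assumptions, not quotes from the patch -- the common/pt split has
roughly this shape:

    /* p2m-pt.c: the pagetable-specific part of initialisation, i.e.
     * wiring up that implementation's entry points (names illustrative). */
    void p2m_pt_init(struct p2m_domain *p2m)
    {
        p2m->set_entry = p2m_set_entry;
        p2m->get_entry = p2m_gfn_to_mfn;
        p2m->get_entry_current = p2m_gfn_to_mfn_current;
        p2m->change_entry_type_global = p2m_change_type_global;
    }

    /* p2m.c: the common part -- generic setup, then dispatch to
     * whichever implementation backs this domain's p2m. */
    static void p2m_initialise(struct domain *d, struct p2m_domain *p2m)
    {
        /* ... generic setup (locks, page lists, default_access) ... */
        p2m->domain = d;

        if ( hap_enabled(d) && cpu_has_vmx )
            ept_p2m_init(d);   /* EPT implementation, now x86/mm/p2m-ept.c */
        else
            p2m_pt_init(p2m);  /* pagetable implementation, x86/mm/p2m-pt.c */
    }

The EPT side keeps its existing entry point, ept_p2m_init(), which the
hunks below simply relocate from x86/mm/hap/ to x86/mm/.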

diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/Makefile
--- a/xen/arch/x86/mm/Makefile  Thu May 05 17:40:34 2011 +0100
+++ b/xen/arch/x86/mm/Makefile  Fri May 06 11:15:35 2011 +0100
@@ -2,7 +2,7 @@ subdir-y += shadow
 subdir-y += hap
 
 obj-y += paging.o
-obj-y += p2m.o
+obj-y += p2m.o p2m-pt.o p2m-ept.o p2m-pod.o
 obj-y += guest_walk_2.o
 obj-y += guest_walk_3.o
 obj-$(x86_64) += guest_walk_4.o
diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/hap/Makefile
--- a/xen/arch/x86/mm/hap/Makefile      Thu May 05 17:40:34 2011 +0100
+++ b/xen/arch/x86/mm/hap/Makefile      Fri May 06 11:15:35 2011 +0100
@@ -2,7 +2,6 @@ obj-y += hap.o
 obj-y += guest_walk_2level.o
 obj-y += guest_walk_3level.o
 obj-$(x86_64) += guest_walk_4level.o
-obj-y += p2m-ept.o
 obj-y += nested_hap.o
 
 guest_walk_%level.o: guest_walk.c Makefile
diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c     Thu May 05 17:40:34 2011 +0100
+++ /dev/null   Thu Jan 01 00:00:00 1970 +0000
@@ -1,911 +0,0 @@
-/*
- * ept-p2m.c: use the EPT page table as p2m
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- */
-
-#include <xen/config.h>
-#include <xen/domain_page.h>
-#include <xen/sched.h>
-#include <asm/current.h>
-#include <asm/paging.h>
-#include <asm/types.h>
-#include <asm/domain.h>
-#include <asm/p2m.h>
-#include <asm/hvm/vmx/vmx.h>
-#include <asm/hvm/vmx/vmcs.h>
-#include <xen/iommu.h>
-#include <asm/mtrr.h>
-#include <asm/hvm/cacheattr.h>
-#include <xen/keyhandler.h>
-#include <xen/softirq.h>
-
-#define atomic_read_ept_entry(__pepte)                              \
-    ( (ept_entry_t) { .epte = atomic_read64(&(__pepte)->epte) } )
-#define atomic_write_ept_entry(__pepte, __epte)                     \
-    atomic_write64(&(__pepte)->epte, (__epte).epte)
-
-#define is_epte_present(ept_entry)      ((ept_entry)->epte & 0x7)
-#define is_epte_superpage(ept_entry)    ((ept_entry)->sp)
-
-/* Non-ept "lock-and-check" wrapper */
-static int ept_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn,
-                                      ept_entry_t *entry, int order,
-                                      p2m_query_t q)
-{
-    /* Only take the lock if we don't already have it.  Otherwise it
-     * wouldn't be safe to do p2m lookups with the p2m lock held */
-    int do_locking = !p2m_locked_by_me(p2m);
-    int r;
-
-    if ( do_locking )
-        p2m_lock(p2m);
-
-    /* Check to make sure this is still PoD */
-    if ( entry->sa_p2mt != p2m_populate_on_demand )
-    {
-        if ( do_locking )
-            p2m_unlock(p2m);
-        return 0;
-    }
-
-    r = p2m_pod_demand_populate(p2m, gfn, order, q);
-
-    if ( do_locking )
-        p2m_unlock(p2m);
-
-    return r;
-}
-
-static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_access_t access)
-{
-    /* First apply type permissions */
-    switch(type)
-    {
-        case p2m_invalid:
-        case p2m_mmio_dm:
-        case p2m_populate_on_demand:
-        case p2m_ram_paging_out:
-        case p2m_ram_paged:
-        case p2m_ram_paging_in:
-        case p2m_ram_paging_in_start:
-        default:
-            entry->r = entry->w = entry->x = 0;
-            break;
-        case p2m_ram_rw:
-            entry->r = entry->w = entry->x = 1;
-            break;
-        case p2m_mmio_direct:
-            entry->r = entry->x = 1;
-            entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
-                                                    entry->mfn);
-            break;
-        case p2m_ram_logdirty:
-        case p2m_ram_ro:
-        case p2m_ram_shared:
-            entry->r = entry->x = 1;
-            entry->w = 0;
-            break;
-        case p2m_grant_map_rw:
-            entry->r = entry->w = 1;
-            entry->x = 0;
-            break;
-        case p2m_grant_map_ro:
-            entry->r = 1;
-            entry->w = entry->x = 0;
-            break;
-    }
-
-
-    /* Then restrict with access permissions */
-    switch (access) 
-    {
-        case p2m_access_n:
-            entry->r = entry->w = entry->x = 0;
-            break;
-        case p2m_access_r:
-            entry->w = entry->x = 0;
-            break;
-        case p2m_access_w:
-            entry->r = entry->x = 0;
-            break;
-        case p2m_access_x:
-            entry->r = entry->w = 0;
-            break;
-        case p2m_access_rx:
-        case p2m_access_rx2rw:
-            entry->w = 0;
-            break;
-        case p2m_access_wx:
-            entry->r = 0;
-            break;
-        case p2m_access_rw:
-            entry->x = 0;
-            break;           
-        case p2m_access_rwx:
-            break;
-    }
-    
-}
-
-#define GUEST_TABLE_MAP_FAILED  0
-#define GUEST_TABLE_NORMAL_PAGE 1
-#define GUEST_TABLE_SUPER_PAGE  2
-#define GUEST_TABLE_POD_PAGE    3
-
-/* Fill in middle levels of ept table */
-static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry)
-{
-    struct page_info *pg;
-
-    pg = p2m_alloc_ptp(p2m, 0);
-    if ( pg == NULL )
-        return 0;
-
-    ept_entry->epte = 0;
-    ept_entry->mfn = page_to_mfn(pg);
-    ept_entry->access = p2m->default_access;
-
-    ept_entry->r = ept_entry->w = ept_entry->x = 1;
-
-    return 1;
-}
-
-/* free ept sub tree behind an entry */
-void ept_free_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry, int level)
-{
-    /* End if the entry is a leaf entry. */
-    if ( level == 0 || !is_epte_present(ept_entry) ||
-         is_epte_superpage(ept_entry) )
-        return;
-
-    if ( level > 1 )
-    {
-        ept_entry_t *epte = map_domain_page(ept_entry->mfn);
-        for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
-            ept_free_entry(p2m, epte + i, level - 1);
-        unmap_domain_page(epte);
-    }
-    
-    p2m_free_ptp(p2m, mfn_to_page(ept_entry->mfn));
-}
-
-static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,
-                                int level, int target)
-{
-    ept_entry_t new_ept, *table;
-    uint64_t trunk;
-    int rv = 1;
-
-    /* End if the entry is a leaf entry or reaches the target level. */
-    if ( level == 0 || level == target )
-        return rv;
-
-    ASSERT(is_epte_superpage(ept_entry));
-
-    if ( !ept_set_middle_entry(p2m, &new_ept) )
-        return 0;
-
-    table = map_domain_page(new_ept.mfn);
-    trunk = 1UL << ((level - 1) * EPT_TABLE_ORDER);
-
-    for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
-    {
-        ept_entry_t *epte = table + i;
-
-        epte->epte = 0;
-        epte->emt = ept_entry->emt;
-        epte->ipat = ept_entry->ipat;
-        epte->sp = (level > 1) ? 1 : 0;
-        epte->access = ept_entry->access;
-        epte->sa_p2mt = ept_entry->sa_p2mt;
-        epte->mfn = ept_entry->mfn + i * trunk;
-        epte->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0;
-
-        ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);
-
-        if ( (level - 1) == target )
-            continue;
-
-        ASSERT(is_epte_superpage(epte));
-
-        if ( !(rv = ept_split_super_page(p2m, epte, level - 1, target)) )
-            break;
-    }
-
-    unmap_domain_page(table);
-
-    /* Even failed we should install the newly allocated ept page. */
-    *ept_entry = new_ept;
-
-    return rv;
-}
-
-/* Take the currently mapped table, find the corresponding gfn entry,
- * and map the next table, if available.  If the entry is empty
- * and read_only is set, 
- * Return values:
- *  0: Failed to map.  Either read_only was set and the entry was
- *   empty, or allocating a new page failed.
- *  GUEST_TABLE_NORMAL_PAGE: next level mapped normally
- *  GUEST_TABLE_SUPER_PAGE:
- *   The next entry points to a superpage, and caller indicates
- *   that they are going to the superpage level, or are only doing
- *   a read.
- *  GUEST_TABLE_POD:
- *   The next entry is marked populate-on-demand.
- */
-static int ept_next_level(struct p2m_domain *p2m, bool_t read_only,
-                          ept_entry_t **table, unsigned long *gfn_remainder,
-                          int next_level)
-{
-    unsigned long mfn;
-    ept_entry_t *ept_entry, e;
-    u32 shift, index;
-
-    shift = next_level * EPT_TABLE_ORDER;
-
-    index = *gfn_remainder >> shift;
-
-    /* index must be falling into the page */
-    ASSERT(index < EPT_PAGETABLE_ENTRIES);
-
-    ept_entry = (*table) + index;
-
-    /* ept_next_level() is called (sometimes) without a lock.  Read
-     * the entry once, and act on the "cached" entry after that to
-     * avoid races. */
-    e = atomic_read_ept_entry(ept_entry);
-
-    if ( !is_epte_present(&e) )
-    {
-        if ( e.sa_p2mt == p2m_populate_on_demand )
-            return GUEST_TABLE_POD_PAGE;
-
-        if ( read_only )
-            return GUEST_TABLE_MAP_FAILED;
-
-        if ( !ept_set_middle_entry(p2m, ept_entry) )
-            return GUEST_TABLE_MAP_FAILED;
-        else
-            e = atomic_read_ept_entry(ept_entry); /* Refresh */
-    }
-
-    /* The only time sp would be set here is if we had hit a superpage */
-    if ( is_epte_superpage(&e) )
-        return GUEST_TABLE_SUPER_PAGE;
-
-    mfn = e.mfn;
-    unmap_domain_page(*table);
-    *table = map_domain_page(mfn);
-    *gfn_remainder &= (1UL << shift) - 1;
-    return GUEST_TABLE_NORMAL_PAGE;
-}
-
-/*
- * ept_set_entry() computes 'need_modify_vtd_table' for itself,
- * by observing whether any gfn->mfn translations are modified.
- */
-static int
-ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
-              unsigned int order, p2m_type_t p2mt, p2m_access_t p2ma)
-{
-    ept_entry_t *table, *ept_entry = NULL;
-    unsigned long gfn_remainder = gfn;
-    unsigned long offset = 0;
-    u32 index;
-    int i, target = order / EPT_TABLE_ORDER;
-    int rv = 0;
-    int ret = 0;
-    bool_t direct_mmio = (p2mt == p2m_mmio_direct);
-    uint8_t ipat = 0;
-    int need_modify_vtd_table = 1;
-    int vtd_pte_present = 0;
-    int needs_sync = 1;
-    struct domain *d = p2m->domain;
-    ept_entry_t old_entry = { .epte = 0 };
-
-    /*
-     * the caller must make sure:
-     * 1. passing valid gfn and mfn at order boundary.
-     * 2. gfn not exceeding guest physical address width.
-     * 3. passing a valid order.
-     */
-    if ( ((gfn | mfn_x(mfn)) & ((1UL << order) - 1)) ||
-         ((u64)gfn >> ((ept_get_wl(d) + 1) * EPT_TABLE_ORDER)) ||
-         (order % EPT_TABLE_ORDER) )
-        return 0;
-
-    ASSERT((target == 2 && hvm_hap_has_1gb(d)) ||
-           (target == 1 && hvm_hap_has_2mb(d)) ||
-           (target == 0));
-
-    table = map_domain_page(ept_get_asr(d));
-
-    ASSERT(table != NULL);
-
-    for ( i = ept_get_wl(d); i > target; i-- )
-    {
-        ret = ept_next_level(p2m, 0, &table, &gfn_remainder, i);
-        if ( !ret )
-            goto out;
-        else if ( ret != GUEST_TABLE_NORMAL_PAGE )
-            break;
-    }
-
-    ASSERT(ret != GUEST_TABLE_POD_PAGE || i != target);
-
-    index = gfn_remainder >> (i * EPT_TABLE_ORDER);
-    offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1);
-
-    ept_entry = table + index;
-
-    /* In case VT-d uses same page table, this flag is needed by VT-d */ 
-    vtd_pte_present = is_epte_present(ept_entry) ? 1 : 0;
-
-    /*
-     * If we're here with i > target, we must be at a leaf node, and
-     * we need to break up the superpage.
-     *
-     * If we're here with i == target and i > 0, we need to check to see
-     * if we're replacing a non-leaf entry (i.e., pointing to an N-1 table)
-     * with a leaf entry (a 1GiB or 2MiB page), and handle things appropriately.
-     */
-
-    if ( i == target )
-    {
-        /* We reached the target level. */
-        ept_entry_t new_entry = { .epte = 0 };
-
-        /* No need to flush if the old entry wasn't valid */
-        if ( !is_epte_present(ept_entry) )
-            needs_sync = 0;
-
-        /* If we're replacing a non-leaf entry with a leaf entry (1GiB or 2MiB),
-         * the intermediate tables will be freed below after the ept flush
-         *
-         * Read-then-write is OK because we hold the p2m lock. */
-        old_entry = *ept_entry;
-
-        if ( mfn_valid(mfn_x(mfn)) || direct_mmio || p2m_is_paged(p2mt) ||
-             (p2mt == p2m_ram_paging_in_start) )
-        {
-            /* Construct the new entry, and then write it once */
-            new_entry.emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat,
-                                                direct_mmio);
-
-            new_entry.ipat = ipat;
-            new_entry.sp = order ? 1 : 0;
-            new_entry.sa_p2mt = p2mt;
-            new_entry.access = p2ma;
-            new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
-
-            new_entry.mfn = mfn_x(mfn);
-
-            if ( old_entry.mfn == new_entry.mfn )
-                need_modify_vtd_table = 0;
-
-            ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
-        }
-
-        atomic_write_ept_entry(ept_entry, new_entry);
-    }
-    else
-    {
-        /* We need to split the original page. */
-        ept_entry_t split_ept_entry;
-        ept_entry_t new_entry = { .epte = 0 };
-
-        ASSERT(is_epte_superpage(ept_entry));
-
-        split_ept_entry = atomic_read_ept_entry(ept_entry);
-
-        if ( !ept_split_super_page(p2m, &split_ept_entry, i, target) )
-        {
-            ept_free_entry(p2m, &split_ept_entry, i);
-            goto out;
-        }
-
-        /* now install the newly split ept sub-tree */
-        /* NB: please make sure domian is paused and no in-fly VT-d DMA. */
-        atomic_write_ept_entry(ept_entry, split_ept_entry);
-
-        /* then move to the level we want to make real changes */
-        for ( ; i > target; i-- )
-            ept_next_level(p2m, 0, &table, &gfn_remainder, i);
-
-        ASSERT(i == target);
-
-        index = gfn_remainder >> (i * EPT_TABLE_ORDER);
-        offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1);
-
-        ept_entry = table + index;
-
-        new_entry.emt = epte_get_entry_emt(d, gfn, mfn, &ipat, direct_mmio);
-        new_entry.ipat = ipat;
-        new_entry.sp = i ? 1 : 0;
-        new_entry.sa_p2mt = p2mt;
-        new_entry.access = p2ma;
-        new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
-
-        /* the caller should take care of the previous page */
-        new_entry.mfn = mfn_x(mfn);
-
-        /* Safe to read-then-write because we hold the p2m lock */
-        if ( ept_entry->mfn == new_entry.mfn )
-             need_modify_vtd_table = 0;
-
-        ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
-
-        atomic_write_ept_entry(ept_entry, new_entry);
-    }
-
-    /* Track the highest gfn for which we have ever had a valid mapping */
-    if ( mfn_valid(mfn_x(mfn)) &&
-         (gfn + (1UL << order) - 1 > p2m->max_mapped_pfn) )
-        p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
-
-    /* Success */
-    rv = 1;
-
-out:
-    unmap_domain_page(table);
-
-    if ( needs_sync )
-        ept_sync_domain(p2m->domain);
-
-    if ( rv && iommu_enabled && need_iommu(p2m->domain) && need_modify_vtd_table )
-    {
-        if ( iommu_hap_pt_share )
-            iommu_pte_flush(d, gfn, (u64*)ept_entry, order, vtd_pte_present);
-        else
-        {
-            if ( p2mt == p2m_ram_rw )
-            {
-                if ( order > 0 )
-                {
-                    for ( i = 0; i < (1 << order); i++ )
-                        iommu_map_page(
-                            p2m->domain, gfn - offset + i, mfn_x(mfn) - offset + i,
-                            IOMMUF_readable | IOMMUF_writable);
-                }
-                else if ( !order )
-                    iommu_map_page(
-                        p2m->domain, gfn, mfn_x(mfn), IOMMUF_readable | IOMMUF_writable);
-            }
-            else
-            {
-                if ( order > 0 )
-                {
-                    for ( i = 0; i < (1 << order); i++ )
-                        iommu_unmap_page(p2m->domain, gfn - offset + i);
-                }
-                else if ( !order )
-                    iommu_unmap_page(p2m->domain, gfn);
-            }
-        }
-    }
-
-    /* Release the old intermediate tables, if any.  This has to be the
-       last thing we do, after the ept_sync_domain() and removal
-       from the iommu tables, so as to avoid a potential
-       use-after-free. */
-    if ( is_epte_present(&old_entry) )
-        ept_free_entry(p2m, &old_entry, target);
-
-    return rv;
-}
-
-/* Read ept p2m entries */
-static mfn_t ept_get_entry(struct p2m_domain *p2m,
-                           unsigned long gfn, p2m_type_t *t, p2m_access_t* a,
-                           p2m_query_t q)
-{
-    struct domain *d = p2m->domain;
-    ept_entry_t *table = map_domain_page(ept_get_asr(d));
-    unsigned long gfn_remainder = gfn;
-    ept_entry_t *ept_entry;
-    u32 index;
-    int i;
-    int ret = 0;
-    mfn_t mfn = _mfn(INVALID_MFN);
-
-    *t = p2m_mmio_dm;
-    *a = p2m_access_n;
-
-    /* This pfn is higher than the highest the p2m map currently holds */
-    if ( gfn > p2m->max_mapped_pfn )
-        goto out;
-
-    /* Should check if gfn obeys GAW here. */
-
-    for ( i = ept_get_wl(d); i > 0; i-- )
-    {
-    retry:
-        ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
-        if ( !ret )
-            goto out;
-        else if ( ret == GUEST_TABLE_POD_PAGE )
-        {
-            if ( q == p2m_query )
-            {
-                *t = p2m_populate_on_demand;
-                goto out;
-            }
-
-            /* Populate this superpage */
-            ASSERT(i == 1);
-
-            index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
-            ept_entry = table + index;
-
-            if ( !ept_pod_check_and_populate(p2m, gfn,
-                                             ept_entry, 9, q) )
-                goto retry;
-            else
-                goto out;
-        }
-        else if ( ret == GUEST_TABLE_SUPER_PAGE )
-            break;
-    }
-
-    index = gfn_remainder >> (i * EPT_TABLE_ORDER);
-    ept_entry = table + index;
-
-    if ( ept_entry->sa_p2mt == p2m_populate_on_demand )
-    {
-        if ( q == p2m_query )
-        {
-            *t = p2m_populate_on_demand;
-            goto out;
-        }
-
-        ASSERT(i == 0);
-        
-        if ( ept_pod_check_and_populate(p2m, gfn,
-                                        ept_entry, 0, q) )
-            goto out;
-    }
-
-    /* Need to check for all-zeroes because typecode 0 is p2m_ram and an
-     * entirely empty entry shouldn't have RAM type. */
-    if ( ept_entry->epte != 0 && ept_entry->sa_p2mt != p2m_invalid )
-    {
-        *t = ept_entry->sa_p2mt;
-        *a = ept_entry->access;
-
-        mfn = _mfn(ept_entry->mfn);
-        if ( i )
-        {
-            /* 
-             * We may meet super pages, and to split into 4k pages
-             * to emulate p2m table
-             */
-            unsigned long split_mfn = mfn_x(mfn) +
-                (gfn_remainder &
-                 ((1 << (i * EPT_TABLE_ORDER)) - 1));
-            mfn = _mfn(split_mfn);
-        }
-    }
-
-out:
-    unmap_domain_page(table);
-    return mfn;
-}
-
-/* WARNING: Only caller doesn't care about PoD pages.  So this function will
- * always return 0 for PoD pages, not populate them.  If that becomes necessary,
- * pass a p2m_query_t type along to distinguish. */
-static ept_entry_t ept_get_entry_content(struct p2m_domain *p2m,
-    unsigned long gfn, int *level)
-{
-    ept_entry_t *table = map_domain_page(ept_get_asr(p2m->domain));
-    unsigned long gfn_remainder = gfn;
-    ept_entry_t *ept_entry;
-    ept_entry_t content = { .epte = 0 };
-    u32 index;
-    int i;
-    int ret=0;
-
-    /* This pfn is higher than the highest the p2m map currently holds */
-    if ( gfn > p2m->max_mapped_pfn )
-        goto out;
-
-    for ( i = ept_get_wl(p2m->domain); i > 0; i-- )
-    {
-        ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
-        if ( !ret || ret == GUEST_TABLE_POD_PAGE )
-            goto out;
-        else if ( ret == GUEST_TABLE_SUPER_PAGE )
-            break;
-    }
-
-    index = gfn_remainder >> (i * EPT_TABLE_ORDER);
-    ept_entry = table + index;
-    content = *ept_entry;
-    *level = i;
-
- out:
-    unmap_domain_page(table);
-    return content;
-}
-
-void ept_walk_table(struct domain *d, unsigned long gfn)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    ept_entry_t *table = map_domain_page(ept_get_asr(d));
-    unsigned long gfn_remainder = gfn;
-
-    int i;
-
-    gdprintk(XENLOG_ERR, "Walking EPT tables for domain %d gfn %lx\n",
-           d->domain_id, gfn);
-
-    /* This pfn is higher than the highest the p2m map currently holds */
-    if ( gfn > p2m->max_mapped_pfn )
-    {
-        gdprintk(XENLOG_ERR, " gfn exceeds max_mapped_pfn %lx\n",
-               p2m->max_mapped_pfn);
-        goto out;
-    }
-
-    for ( i = ept_get_wl(d); i >= 0; i-- )
-    {
-        ept_entry_t *ept_entry, *next;
-        u32 index;
-
-        /* Stolen from ept_next_level */
-        index = gfn_remainder >> (i*EPT_TABLE_ORDER);
-        ept_entry = table + index;
-
-        gdprintk(XENLOG_ERR, " epte %"PRIx64"\n", ept_entry->epte);
-
-        if ( (i == 0) || !is_epte_present(ept_entry) ||
-             is_epte_superpage(ept_entry) )
-            goto out;
-        else
-        {
-            gfn_remainder &= (1UL << (i*EPT_TABLE_ORDER)) - 1;
-
-            next = map_domain_page(ept_entry->mfn);
-
-            unmap_domain_page(table);
-
-            table = next;
-        }
-    }
-
-out:
-    unmap_domain_page(table);
-    return;
-}
-
-static mfn_t ept_get_entry_current(struct p2m_domain *p2m,
-                                   unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
-                                   p2m_query_t q)
-{
-    return ept_get_entry(p2m, gfn, t, a, q);
-}
-
-/*
- * To test if the new emt type is the same with old,
- * return 1 to not to reset ept entry.
- */
-static int need_modify_ept_entry(struct p2m_domain *p2m, unsigned long gfn,
-                                 mfn_t mfn, uint8_t o_ipat, uint8_t o_emt,
-                                 p2m_type_t p2mt)
-{
-    uint8_t ipat;
-    uint8_t emt;
-    bool_t direct_mmio = (p2mt == p2m_mmio_direct);
-
-    emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat, direct_mmio);
-
-    if ( (emt == o_emt) && (ipat == o_ipat) )
-        return 0;
-
-    return 1;
-}
-
-void ept_change_entry_emt_with_range(struct domain *d,
-                                     unsigned long start_gfn,
-                                     unsigned long end_gfn)
-{
-    unsigned long gfn;
-    ept_entry_t e;
-    mfn_t mfn;
-    int order = 0;
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
-    p2m_lock(p2m);
-    for ( gfn = start_gfn; gfn <= end_gfn; gfn++ )
-    {
-        int level = 0;
-        uint64_t trunk = 0;
-
-        e = ept_get_entry_content(p2m, gfn, &level);
-        if ( !p2m_has_emt(e.sa_p2mt) )
-            continue;
-
-        order = 0;
-        mfn = _mfn(e.mfn);
-
-        if ( is_epte_superpage(&e) )
-        {
-            while ( level )
-            {
-                trunk = (1UL << (level * EPT_TABLE_ORDER)) - 1;
-                if ( !(gfn & trunk) && (gfn + trunk <= end_gfn) )
-                {
-                    /* gfn assigned with 2M or 1G, and the end covers more than
-                     * the super page areas.
-                     * Set emt for super page.
-                     */
-                    order = level * EPT_TABLE_ORDER;
-                    if ( need_modify_ept_entry(p2m, gfn, mfn, 
-                          e.ipat, e.emt, e.sa_p2mt) )
-                        ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt, e.access);
-                    gfn += trunk;
-                    break;
-                }
-                level--;
-             }
-        }
-        else /* gfn assigned with 4k */
-        {
-            if ( need_modify_ept_entry(p2m, gfn, mfn, e.ipat, e.emt, e.sa_p2mt) )
-                ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt, e.access);
-        }
-    }
-    p2m_unlock(p2m);
-}
-
-/*
- * Walk the whole p2m table, changing any entries of the old type
- * to the new type.  This is used in hardware-assisted paging to
- * quickly enable or diable log-dirty tracking
- */
-static void ept_change_entry_type_page(mfn_t ept_page_mfn, int ept_page_level,
-                                       p2m_type_t ot, p2m_type_t nt)
-{
-    ept_entry_t e, *epte = map_domain_page(mfn_x(ept_page_mfn));
-
-    for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
-    {
-        if ( !is_epte_present(epte + i) )
-            continue;
-
-        if ( (ept_page_level > 0) && !is_epte_superpage(epte + i) )
-            ept_change_entry_type_page(_mfn(epte[i].mfn),
-                                       ept_page_level - 1, ot, nt);
-        else
-        {
-            e = atomic_read_ept_entry(&epte[i]);
-            if ( e.sa_p2mt != ot )
-                continue;
-
-            e.sa_p2mt = nt;
-            ept_p2m_type_to_flags(&e, nt, e.access);
-            atomic_write_ept_entry(&epte[i], e);
-        }
-    }
-
-    unmap_domain_page(epte);
-}
-
-static void ept_change_entry_type_global(struct p2m_domain *p2m,
-                                         p2m_type_t ot, p2m_type_t nt)
-{
-    struct domain *d = p2m->domain;
-    if ( ept_get_asr(d) == 0 )
-        return;
-
-    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
-    BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
-
-    ept_change_entry_type_page(_mfn(ept_get_asr(d)), ept_get_wl(d), ot, nt);
-
-    ept_sync_domain(d);
-}
-
-void ept_p2m_init(struct domain *d)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    p2m->set_entry = ept_set_entry;
-    p2m->get_entry = ept_get_entry;
-    p2m->get_entry_current = ept_get_entry_current;
-    p2m->change_entry_type_global = ept_change_entry_type_global;
-}
-
-static void ept_dump_p2m_table(unsigned char key)
-{
-    struct domain *d;
-    ept_entry_t *table, *ept_entry;
-    mfn_t mfn;
-    int order;
-    int i;
-    int is_pod;
-    int ret = 0;
-    unsigned long index;
-    unsigned long gfn, gfn_remainder;
-    unsigned long record_counter = 0;
-    struct p2m_domain *p2m;
-
-    for_each_domain(d)
-    {
-        if ( !hap_enabled(d) )
-            continue;
-
-        p2m = p2m_get_hostp2m(d);
-        printk("\ndomain%d EPT p2m table: \n", d->domain_id);
-
-        for ( gfn = 0; gfn <= p2m->max_mapped_pfn; gfn += (1 << order) )
-        {
-            gfn_remainder = gfn;
-            mfn = _mfn(INVALID_MFN);
-            table = map_domain_page(ept_get_asr(d));
-
-            for ( i = ept_get_wl(d); i > 0; i-- )
-            {
-                ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
-                if ( ret != GUEST_TABLE_NORMAL_PAGE )
-                    break;
-            }
-
-            order = i * EPT_TABLE_ORDER;
-
-            if ( ret == GUEST_TABLE_MAP_FAILED )
-                goto out;
-
-            index = gfn_remainder >> order;
-            ept_entry = table + index;
-            if ( ept_entry->sa_p2mt != p2m_invalid )
-            {
-                ( ept_entry->sa_p2mt == p2m_populate_on_demand ) ? 
-                ( mfn = _mfn(INVALID_MFN), is_pod = 1 ) :
-                ( mfn = _mfn(ept_entry->mfn), is_pod = 0 );
-
-                printk("gfn: %-16lx  mfn: %-16lx  order: %2d  is_pod: %d\n",
-                       gfn, mfn_x(mfn), order, is_pod);
-
-                if ( !(record_counter++ % 100) )
-                    process_pending_softirqs();
-            }
-out:
-            unmap_domain_page(table);
-        }
-    }
-}
-
-static struct keyhandler ept_p2m_table = {
-    .diagnostic = 0,
-    .u.fn = ept_dump_p2m_table,
-    .desc = "dump ept p2m table"
-};
-
-void setup_ept_dump(void)
-{
-    register_keyhandler('D', &ept_p2m_table);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/p2m-ept.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/p2m-ept.c Fri May 06 11:15:35 2011 +0100
@@ -0,0 +1,911 @@
+/*
+ * ept-p2m.c: use the EPT page table as p2m
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <xen/config.h>
+#include <xen/domain_page.h>
+#include <xen/sched.h>
+#include <asm/current.h>
+#include <asm/paging.h>
+#include <asm/types.h>
+#include <asm/domain.h>
+#include <asm/p2m.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <asm/hvm/vmx/vmcs.h>
+#include <xen/iommu.h>
+#include <asm/mtrr.h>
+#include <asm/hvm/cacheattr.h>
+#include <xen/keyhandler.h>
+#include <xen/softirq.h>
+
+#define atomic_read_ept_entry(__pepte)                              \
+    ( (ept_entry_t) { .epte = atomic_read64(&(__pepte)->epte) } )
+#define atomic_write_ept_entry(__pepte, __epte)                     \
+    atomic_write64(&(__pepte)->epte, (__epte).epte)
+
+#define is_epte_present(ept_entry)      ((ept_entry)->epte & 0x7)
+#define is_epte_superpage(ept_entry)    ((ept_entry)->sp)
+
+/* Non-ept "lock-and-check" wrapper */
+static int ept_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn,
+                                      ept_entry_t *entry, int order,
+                                      p2m_query_t q)
+{
+    /* Only take the lock if we don't already have it.  Otherwise it
+     * wouldn't be safe to do p2m lookups with the p2m lock held */
+    int do_locking = !p2m_locked_by_me(p2m);
+    int r;
+
+    if ( do_locking )
+        p2m_lock(p2m);
+
+    /* Check to make sure this is still PoD */
+    if ( entry->sa_p2mt != p2m_populate_on_demand )
+    {
+        if ( do_locking )
+            p2m_unlock(p2m);
+        return 0;
+    }
+
+    r = p2m_pod_demand_populate(p2m, gfn, order, q);
+
+    if ( do_locking )
+        p2m_unlock(p2m);
+
+    return r;
+}
+
+static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_access_t access)
+{
+    /* First apply type permissions */
+    switch(type)
+    {
+        case p2m_invalid:
+        case p2m_mmio_dm:
+        case p2m_populate_on_demand:
+        case p2m_ram_paging_out:
+        case p2m_ram_paged:
+        case p2m_ram_paging_in:
+        case p2m_ram_paging_in_start:
+        default:
+            entry->r = entry->w = entry->x = 0;
+            break;
+        case p2m_ram_rw:
+            entry->r = entry->w = entry->x = 1;
+            break;
+        case p2m_mmio_direct:
+            entry->r = entry->x = 1;
+            entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
+                                                    entry->mfn);
+            break;
+        case p2m_ram_logdirty:
+        case p2m_ram_ro:
+        case p2m_ram_shared:
+            entry->r = entry->x = 1;
+            entry->w = 0;
+            break;
+        case p2m_grant_map_rw:
+            entry->r = entry->w = 1;
+            entry->x = 0;
+            break;
+        case p2m_grant_map_ro:
+            entry->r = 1;
+            entry->w = entry->x = 0;
+            break;
+    }
+
+
+    /* Then restrict with access permissions */
+    switch (access) 
+    {
+        case p2m_access_n:
+            entry->r = entry->w = entry->x = 0;
+            break;
+        case p2m_access_r:
+            entry->w = entry->x = 0;
+            break;
+        case p2m_access_w:
+            entry->r = entry->x = 0;
+            break;
+        case p2m_access_x:
+            entry->r = entry->w = 0;
+            break;
+        case p2m_access_rx:
+        case p2m_access_rx2rw:
+            entry->w = 0;
+            break;
+        case p2m_access_wx:
+            entry->r = 0;
+            break;
+        case p2m_access_rw:
+            entry->x = 0;
+            break;           
+        case p2m_access_rwx:
+            break;
+    }
+    
+}
+
+#define GUEST_TABLE_MAP_FAILED  0
+#define GUEST_TABLE_NORMAL_PAGE 1
+#define GUEST_TABLE_SUPER_PAGE  2
+#define GUEST_TABLE_POD_PAGE    3
+
+/* Fill in middle levels of ept table */
+static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry)
+{
+    struct page_info *pg;
+
+    pg = p2m_alloc_ptp(p2m, 0);
+    if ( pg == NULL )
+        return 0;
+
+    ept_entry->epte = 0;
+    ept_entry->mfn = page_to_mfn(pg);
+    ept_entry->access = p2m->default_access;
+
+    ept_entry->r = ept_entry->w = ept_entry->x = 1;
+
+    return 1;
+}
+
+/* free ept sub tree behind an entry */
+void ept_free_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry, int level)
+{
+    /* End if the entry is a leaf entry. */
+    if ( level == 0 || !is_epte_present(ept_entry) ||
+         is_epte_superpage(ept_entry) )
+        return;
+
+    if ( level > 1 )
+    {
+        ept_entry_t *epte = map_domain_page(ept_entry->mfn);
+        for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+            ept_free_entry(p2m, epte + i, level - 1);
+        unmap_domain_page(epte);
+    }
+    
+    p2m_free_ptp(p2m, mfn_to_page(ept_entry->mfn));
+}
+
+static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,
+                                int level, int target)
+{
+    ept_entry_t new_ept, *table;
+    uint64_t trunk;
+    int rv = 1;
+
+    /* End if the entry is a leaf entry or reaches the target level. */
+    if ( level == 0 || level == target )
+        return rv;
+
+    ASSERT(is_epte_superpage(ept_entry));
+
+    if ( !ept_set_middle_entry(p2m, &new_ept) )
+        return 0;
+
+    table = map_domain_page(new_ept.mfn);
+    trunk = 1UL << ((level - 1) * EPT_TABLE_ORDER);
+
+    for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+    {
+        ept_entry_t *epte = table + i;
+
+        epte->epte = 0;
+        epte->emt = ept_entry->emt;
+        epte->ipat = ept_entry->ipat;
+        epte->sp = (level > 1) ? 1 : 0;
+        epte->access = ept_entry->access;
+        epte->sa_p2mt = ept_entry->sa_p2mt;
+        epte->mfn = ept_entry->mfn + i * trunk;
+        epte->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0;
+
+        ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);
+
+        if ( (level - 1) == target )
+            continue;
+
+        ASSERT(is_epte_superpage(epte));
+
+        if ( !(rv = ept_split_super_page(p2m, epte, level - 1, target)) )
+            break;
+    }
+
+    unmap_domain_page(table);
+
+    /* Even failed we should install the newly allocated ept page. */
+    *ept_entry = new_ept;
+
+    return rv;
+}
+
+/* Take the currently mapped table, find the corresponding gfn entry,
+ * and map the next table, if available.  If the entry is empty
+ * and read_only is set, 
+ * Return values:
+ *  0: Failed to map.  Either read_only was set and the entry was
+ *   empty, or allocating a new page failed.
+ *  GUEST_TABLE_NORMAL_PAGE: next level mapped normally
+ *  GUEST_TABLE_SUPER_PAGE:
+ *   The next entry points to a superpage, and caller indicates
+ *   that they are going to the superpage level, or are only doing
+ *   a read.
+ *  GUEST_TABLE_POD:
+ *   The next entry is marked populate-on-demand.
+ */
+static int ept_next_level(struct p2m_domain *p2m, bool_t read_only,
+                          ept_entry_t **table, unsigned long *gfn_remainder,
+                          int next_level)
+{
+    unsigned long mfn;
+    ept_entry_t *ept_entry, e;
+    u32 shift, index;
+
+    shift = next_level * EPT_TABLE_ORDER;
+
+    index = *gfn_remainder >> shift;
+
+    /* index must be falling into the page */
+    ASSERT(index < EPT_PAGETABLE_ENTRIES);
+
+    ept_entry = (*table) + index;
+
+    /* ept_next_level() is called (sometimes) without a lock.  Read
+     * the entry once, and act on the "cached" entry after that to
+     * avoid races. */
+    e = atomic_read_ept_entry(ept_entry);
+
+    if ( !is_epte_present(&e) )
+    {
+        if ( e.sa_p2mt == p2m_populate_on_demand )
+            return GUEST_TABLE_POD_PAGE;
+
+        if ( read_only )
+            return GUEST_TABLE_MAP_FAILED;
+
+        if ( !ept_set_middle_entry(p2m, ept_entry) )
+            return GUEST_TABLE_MAP_FAILED;
+        else
+            e = atomic_read_ept_entry(ept_entry); /* Refresh */
+    }
+
+    /* The only time sp would be set here is if we had hit a superpage */
+    if ( is_epte_superpage(&e) )
+        return GUEST_TABLE_SUPER_PAGE;
+
+    mfn = e.mfn;
+    unmap_domain_page(*table);
+    *table = map_domain_page(mfn);
+    *gfn_remainder &= (1UL << shift) - 1;
+    return GUEST_TABLE_NORMAL_PAGE;
+}
+
+/*
+ * ept_set_entry() computes 'need_modify_vtd_table' for itself,
+ * by observing whether any gfn->mfn translations are modified.
+ */
+static int
+ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
+              unsigned int order, p2m_type_t p2mt, p2m_access_t p2ma)
+{
+    ept_entry_t *table, *ept_entry = NULL;
+    unsigned long gfn_remainder = gfn;
+    unsigned long offset = 0;
+    u32 index;
+    int i, target = order / EPT_TABLE_ORDER;
+    int rv = 0;
+    int ret = 0;
+    bool_t direct_mmio = (p2mt == p2m_mmio_direct);
+    uint8_t ipat = 0;
+    int need_modify_vtd_table = 1;
+    int vtd_pte_present = 0;
+    int needs_sync = 1;
+    struct domain *d = p2m->domain;
+    ept_entry_t old_entry = { .epte = 0 };
+
+    /*
+     * the caller must make sure:
+     * 1. passing valid gfn and mfn at order boundary.
+     * 2. gfn not exceeding guest physical address width.
+     * 3. passing a valid order.
+     */
+    if ( ((gfn | mfn_x(mfn)) & ((1UL << order) - 1)) ||
+         ((u64)gfn >> ((ept_get_wl(d) + 1) * EPT_TABLE_ORDER)) ||
+         (order % EPT_TABLE_ORDER) )
+        return 0;
+
+    ASSERT((target == 2 && hvm_hap_has_1gb(d)) ||
+           (target == 1 && hvm_hap_has_2mb(d)) ||
+           (target == 0));
+
+    table = map_domain_page(ept_get_asr(d));
+
+    ASSERT(table != NULL);
+
+    for ( i = ept_get_wl(d); i > target; i-- )
+    {
+        ret = ept_next_level(p2m, 0, &table, &gfn_remainder, i);
+        if ( !ret )
+            goto out;
+        else if ( ret != GUEST_TABLE_NORMAL_PAGE )
+            break;
+    }
+
+    ASSERT(ret != GUEST_TABLE_POD_PAGE || i != target);
+
+    index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+    offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1);
+
+    ept_entry = table + index;
+
+    /* In case VT-d uses same page table, this flag is needed by VT-d */ 
+    vtd_pte_present = is_epte_present(ept_entry) ? 1 : 0;
+
+    /*
+     * If we're here with i > target, we must be at a leaf node, and
+     * we need to break up the superpage.
+     *
+     * If we're here with i == target and i > 0, we need to check to see
+     * if we're replacing a non-leaf entry (i.e., pointing to an N-1 table)
+     * with a leaf entry (a 1GiB or 2MiB page), and handle things appropriately.
+     */
+
+    if ( i == target )
+    {
+        /* We reached the target level. */
+        ept_entry_t new_entry = { .epte = 0 };
+
+        /* No need to flush if the old entry wasn't valid */
+        if ( !is_epte_present(ept_entry) )
+            needs_sync = 0;
+
+        /* If we're replacing a non-leaf entry with a leaf entry (1GiB or 2MiB),
+         * the intermediate tables will be freed below after the ept flush
+         *
+         * Read-then-write is OK because we hold the p2m lock. */
+        old_entry = *ept_entry;
+
+        if ( mfn_valid(mfn_x(mfn)) || direct_mmio || p2m_is_paged(p2mt) ||
+             (p2mt == p2m_ram_paging_in_start) )
+        {
+            /* Construct the new entry, and then write it once */
+            new_entry.emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat,
+                                                direct_mmio);
+
+            new_entry.ipat = ipat;
+            new_entry.sp = order ? 1 : 0;
+            new_entry.sa_p2mt = p2mt;
+            new_entry.access = p2ma;
+            new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
+
+            new_entry.mfn = mfn_x(mfn);
+
+            if ( old_entry.mfn == new_entry.mfn )
+                need_modify_vtd_table = 0;
+
+            ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
+        }
+
+        atomic_write_ept_entry(ept_entry, new_entry);
+    }
+    else
+    {
+        /* We need to split the original page. */
+        ept_entry_t split_ept_entry;
+        ept_entry_t new_entry = { .epte = 0 };
+
+        ASSERT(is_epte_superpage(ept_entry));
+
+        split_ept_entry = atomic_read_ept_entry(ept_entry);
+
+        if ( !ept_split_super_page(p2m, &split_ept_entry, i, target) )
+        {
+            ept_free_entry(p2m, &split_ept_entry, i);
+            goto out;
+        }
+
+        /* now install the newly split ept sub-tree */
+        /* NB: please make sure domian is paused and no in-fly VT-d DMA. */
+        atomic_write_ept_entry(ept_entry, split_ept_entry);
+
+        /* then move to the level we want to make real changes */
+        for ( ; i > target; i-- )
+            ept_next_level(p2m, 0, &table, &gfn_remainder, i);
+
+        ASSERT(i == target);
+
+        index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+        offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1);
+
+        ept_entry = table + index;
+
+        new_entry.emt = epte_get_entry_emt(d, gfn, mfn, &ipat, direct_mmio);
+        new_entry.ipat = ipat;
+        new_entry.sp = i ? 1 : 0;
+        new_entry.sa_p2mt = p2mt;
+        new_entry.access = p2ma;
+        new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
+
+        /* the caller should take care of the previous page */
+        new_entry.mfn = mfn_x(mfn);
+
+        /* Safe to read-then-write because we hold the p2m lock */
+        if ( ept_entry->mfn == new_entry.mfn )
+             need_modify_vtd_table = 0;
+
+        ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
+
+        atomic_write_ept_entry(ept_entry, new_entry);
+    }
+
+    /* Track the highest gfn for which we have ever had a valid mapping */
+    if ( mfn_valid(mfn_x(mfn)) &&
+         (gfn + (1UL << order) - 1 > p2m->max_mapped_pfn) )
+        p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
+
+    /* Success */
+    rv = 1;
+
+out:
+    unmap_domain_page(table);
+
+    if ( needs_sync )
+        ept_sync_domain(p2m->domain);
+
+    if ( rv && iommu_enabled && need_iommu(p2m->domain) && need_modify_vtd_table )
+    {
+        if ( iommu_hap_pt_share )
+            iommu_pte_flush(d, gfn, (u64*)ept_entry, order, vtd_pte_present);
+        else
+        {
+            if ( p2mt == p2m_ram_rw )
+            {
+                if ( order > 0 )
+                {
+                    for ( i = 0; i < (1 << order); i++ )
+                        iommu_map_page(
+                            p2m->domain, gfn - offset + i, mfn_x(mfn) - offset + i,
+                            IOMMUF_readable | IOMMUF_writable);
+                }
+                else if ( !order )
+                    iommu_map_page(
+                        p2m->domain, gfn, mfn_x(mfn), IOMMUF_readable | IOMMUF_writable);
+            }
+            else
+            {
+                if ( order > 0 )
+                {
+                    for ( i = 0; i < (1 << order); i++ )
+                        iommu_unmap_page(p2m->domain, gfn - offset + i);
+                }
+                else if ( !order )
+                    iommu_unmap_page(p2m->domain, gfn);
+            }
+        }
+    }
+
+    /* Release the old intermediate tables, if any.  This has to be the
+       last thing we do, after the ept_sync_domain() and removal
+       from the iommu tables, so as to avoid a potential
+       use-after-free. */
+    if ( is_epte_present(&old_entry) )
+        ept_free_entry(p2m, &old_entry, target);
+
+    return rv;
+}
+
+/* Read ept p2m entries */
+static mfn_t ept_get_entry(struct p2m_domain *p2m,
+                           unsigned long gfn, p2m_type_t *t, p2m_access_t* a,
+                           p2m_query_t q)
+{
+    struct domain *d = p2m->domain;
+    ept_entry_t *table = map_domain_page(ept_get_asr(d));
+    unsigned long gfn_remainder = gfn;
+    ept_entry_t *ept_entry;
+    u32 index;
+    int i;
+    int ret = 0;
+    mfn_t mfn = _mfn(INVALID_MFN);
+
+    *t = p2m_mmio_dm;
+    *a = p2m_access_n;
+
+    /* This pfn is higher than the highest the p2m map currently holds */
+    if ( gfn > p2m->max_mapped_pfn )
+        goto out;
+
+    /* Should check if gfn obeys GAW here. */
+
+    for ( i = ept_get_wl(d); i > 0; i-- )
+    {
+    retry:
+        ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
+        if ( !ret )
+            goto out;
+        else if ( ret == GUEST_TABLE_POD_PAGE )
+        {
+            if ( q == p2m_query )
+            {
+                *t = p2m_populate_on_demand;
+                goto out;
+            }
+
+            /* Populate this superpage */
+            ASSERT(i == 1);
+
+            index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
+            ept_entry = table + index;
+
+            if ( !ept_pod_check_and_populate(p2m, gfn,
+                                             ept_entry, 9, q) )
+                goto retry;
+            else
+                goto out;
+        }
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+    ept_entry = table + index;
+
+    if ( ept_entry->sa_p2mt == p2m_populate_on_demand )
+    {
+        if ( q == p2m_query )
+        {
+            *t = p2m_populate_on_demand;
+            goto out;
+        }
+
+        ASSERT(i == 0);
+        
+        if ( ept_pod_check_and_populate(p2m, gfn,
+                                        ept_entry, 0, q) )
+            goto out;
+    }
+
+    /* Need to check for all-zeroes because typecode 0 is p2m_ram and an
+     * entirely empty entry shouldn't have RAM type. */
+    if ( ept_entry->epte != 0 && ept_entry->sa_p2mt != p2m_invalid )
+    {
+        *t = ept_entry->sa_p2mt;
+        *a = ept_entry->access;
+
+        mfn = _mfn(ept_entry->mfn);
+        if ( i )
+        {
+            /* 
+             * We may meet super pages, and to split into 4k pages
+             * to emulate p2m table
+             */
+            unsigned long split_mfn = mfn_x(mfn) +
+                (gfn_remainder &
+                 ((1 << (i * EPT_TABLE_ORDER)) - 1));
+            mfn = _mfn(split_mfn);
+        }
+    }
+
+out:
+    unmap_domain_page(table);
+    return mfn;
+}
+
+/* WARNING: Only caller doesn't care about PoD pages.  So this function will
+ * always return 0 for PoD pages, not populate them.  If that becomes necessary,
+ * pass a p2m_query_t type along to distinguish. */
+static ept_entry_t ept_get_entry_content(struct p2m_domain *p2m,
+    unsigned long gfn, int *level)
+{
+    ept_entry_t *table = map_domain_page(ept_get_asr(p2m->domain));
+    unsigned long gfn_remainder = gfn;
+    ept_entry_t *ept_entry;
+    ept_entry_t content = { .epte = 0 };
+    u32 index;
+    int i;
+    int ret=0;
+
+    /* This pfn is higher than the highest the p2m map currently holds */
+    if ( gfn > p2m->max_mapped_pfn )
+        goto out;
+
+    for ( i = ept_get_wl(p2m->domain); i > 0; i-- )
+    {
+        ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
+        if ( !ret || ret == GUEST_TABLE_POD_PAGE )
+            goto out;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+    ept_entry = table + index;
+    content = *ept_entry;
+    *level = i;
+
+ out:
+    unmap_domain_page(table);
+    return content;
+}
+
+void ept_walk_table(struct domain *d, unsigned long gfn)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    ept_entry_t *table = map_domain_page(ept_get_asr(d));
+    unsigned long gfn_remainder = gfn;
+
+    int i;
+
+    gdprintk(XENLOG_ERR, "Walking EPT tables for domain %d gfn %lx\n",
+           d->domain_id, gfn);
+
+    /* This pfn is higher than the highest the p2m map currently holds */
+    if ( gfn > p2m->max_mapped_pfn )
+    {
+        gdprintk(XENLOG_ERR, " gfn exceeds max_mapped_pfn %lx\n",
+               p2m->max_mapped_pfn);
+        goto out;
+    }
+
+    for ( i = ept_get_wl(d); i >= 0; i-- )
+    {
+        ept_entry_t *ept_entry, *next;
+        u32 index;
+
+        /* Stolen from ept_next_level */
+        index = gfn_remainder >> (i*EPT_TABLE_ORDER);
+        ept_entry = table + index;
+
+        gdprintk(XENLOG_ERR, " epte %"PRIx64"\n", ept_entry->epte);
+
+        if ( (i == 0) || !is_epte_present(ept_entry) ||
+             is_epte_superpage(ept_entry) )
+            goto out;
+        else
+        {
+            gfn_remainder &= (1UL << (i*EPT_TABLE_ORDER)) - 1;
+
+            next = map_domain_page(ept_entry->mfn);
+
+            unmap_domain_page(table);
+
+            table = next;
+        }
+    }
+
+out:
+    unmap_domain_page(table);
+    return;
+}
+
+static mfn_t ept_get_entry_current(struct p2m_domain *p2m,
+                                   unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
+                                   p2m_query_t q)
+{
+    return ept_get_entry(p2m, gfn, t, a, q);
+}
+
+/*
+ * To test if the new emt type is the same with old,
+ * return 1 to not to reset ept entry.
+ */
+static int need_modify_ept_entry(struct p2m_domain *p2m, unsigned long gfn,
+                                 mfn_t mfn, uint8_t o_ipat, uint8_t o_emt,
+                                 p2m_type_t p2mt)
+{
+    uint8_t ipat;
+    uint8_t emt;
+    bool_t direct_mmio = (p2mt == p2m_mmio_direct);
+
+    emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat, direct_mmio);
+
+    if ( (emt == o_emt) && (ipat == o_ipat) )
+        return 0;
+
+    return 1;
+}
+
+void ept_change_entry_emt_with_range(struct domain *d,
+                                     unsigned long start_gfn,
+                                     unsigned long end_gfn)
+{
+    unsigned long gfn;
+    ept_entry_t e;
+    mfn_t mfn;
+    int order = 0;
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    p2m_lock(p2m);
+    for ( gfn = start_gfn; gfn <= end_gfn; gfn++ )
+    {
+        int level = 0;
+        uint64_t trunk = 0;
+
+        e = ept_get_entry_content(p2m, gfn, &level);
+        if ( !p2m_has_emt(e.sa_p2mt) )
+            continue;
+
+        order = 0;
+        mfn = _mfn(e.mfn);
+
+        if ( is_epte_superpage(&e) )
+        {
+            while ( level )
+            {
+                trunk = (1UL << (level * EPT_TABLE_ORDER)) - 1;
+                if ( !(gfn & trunk) && (gfn + trunk <= end_gfn) )
+                {
+                    /* gfn assigned with 2M or 1G, and the end covers more than
+                     * the super page areas.
+                     * Set emt for super page.
+                     */
+                    order = level * EPT_TABLE_ORDER;
+                    if ( need_modify_ept_entry(p2m, gfn, mfn, 
+                          e.ipat, e.emt, e.sa_p2mt) )
+                        ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt, e.access);
+                    gfn += trunk;
+                    break;
+                }
+                level--;
+             }
+        }
+        else /* gfn assigned with 4k */
+        {
+            if ( need_modify_ept_entry(p2m, gfn, mfn, e.ipat, e.emt, e.sa_p2mt) )
+                ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt, e.access);
+        }
+    }
+    p2m_unlock(p2m);
+}
+
+/*
+ * Walk the whole p2m table, changing any entries of the old type
+ * to the new type.  This is used in hardware-assisted paging to
+ * quickly enable or diable log-dirty tracking
+ */
+static void ept_change_entry_type_page(mfn_t ept_page_mfn, int ept_page_level,
+                                       p2m_type_t ot, p2m_type_t nt)
+{
+    ept_entry_t e, *epte = map_domain_page(mfn_x(ept_page_mfn));
+
+    for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+    {
+        if ( !is_epte_present(epte + i) )
+            continue;
+
+        if ( (ept_page_level > 0) && !is_epte_superpage(epte + i) )
+            ept_change_entry_type_page(_mfn(epte[i].mfn),
+                                       ept_page_level - 1, ot, nt);
+        else
+        {
+            e = atomic_read_ept_entry(&epte[i]);
+            if ( e.sa_p2mt != ot )
+                continue;
+
+            e.sa_p2mt = nt;
+            ept_p2m_type_to_flags(&e, nt, e.access);
+            atomic_write_ept_entry(&epte[i], e);
+        }
+    }
+
+    unmap_domain_page(epte);
+}
+
+static void ept_change_entry_type_global(struct p2m_domain *p2m,
+                                         p2m_type_t ot, p2m_type_t nt)
+{
+    struct domain *d = p2m->domain;
+    if ( ept_get_asr(d) == 0 )
+        return;
+
+    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+    BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
+
+    ept_change_entry_type_page(_mfn(ept_get_asr(d)), ept_get_wl(d), ot, nt);
+
+    ept_sync_domain(d);
+}
+
+void ept_p2m_init(struct domain *d)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    p2m->set_entry = ept_set_entry;
+    p2m->get_entry = ept_get_entry;
+    p2m->get_entry_current = ept_get_entry_current;
+    p2m->change_entry_type_global = ept_change_entry_type_global;
+}
+
+static void ept_dump_p2m_table(unsigned char key)
+{
+    struct domain *d;
+    ept_entry_t *table, *ept_entry;
+    mfn_t mfn;
+    int order;
+    int i;
+    int is_pod;
+    int ret = 0;
+    unsigned long index;
+    unsigned long gfn, gfn_remainder;
+    unsigned long record_counter = 0;
+    struct p2m_domain *p2m;
+
+    for_each_domain(d)
+    {
+        if ( !hap_enabled(d) )
+            continue;
+
+        p2m = p2m_get_hostp2m(d);
+        printk("\ndomain%d EPT p2m table: \n", d->domain_id);
+
+        for ( gfn = 0; gfn <= p2m->max_mapped_pfn; gfn += (1 << order) )
+        {
+            gfn_remainder = gfn;
+            mfn = _mfn(INVALID_MFN);
+            table = map_domain_page(ept_get_asr(d));
+
+            for ( i = ept_get_wl(d); i > 0; i-- )
+            {
+                ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
+                if ( ret != GUEST_TABLE_NORMAL_PAGE )
+                    break;
+            }
+
+            order = i * EPT_TABLE_ORDER;
+
+            if ( ret == GUEST_TABLE_MAP_FAILED )
+                goto out;
+
+            index = gfn_remainder >> order;
+            ept_entry = table + index;
+            if ( ept_entry->sa_p2mt != p2m_invalid )
+            {
+                ( ept_entry->sa_p2mt == p2m_populate_on_demand ) ? 
+                ( mfn = _mfn(INVALID_MFN), is_pod = 1 ) :
+                ( mfn = _mfn(ept_entry->mfn), is_pod = 0 );
+
+                printk("gfn: %-16lx  mfn: %-16lx  order: %2d  is_pod: %d\n",
+                       gfn, mfn_x(mfn), order, is_pod);
+
+                if ( !(record_counter++ % 100) )
+                    process_pending_softirqs();
+            }
+out:
+            unmap_domain_page(table);
+        }
+    }
+}
+
+static struct keyhandler ept_p2m_table = {
+    .diagnostic = 0,
+    .u.fn = ept_dump_p2m_table,
+    .desc = "dump ept p2m table"
+};
+
+void setup_ept_dump(void)
+{
+    register_keyhandler('D', &ept_p2m_table);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/p2m-pod.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/p2m-pod.c Fri May 06 11:15:35 2011 +0100
@@ -0,0 +1,1151 @@
+/******************************************************************************
+ * arch/x86/mm/p2m-pod.c
+ *
+ * Populate-on-demand p2m entries. 
+ *
+ * Copyright (c) 2009-2011 Citrix Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <asm/domain.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */
+#include <xen/iommu.h>
+#include <asm/mem_event.h>
+#include <public/mem_event.h>
+#include <asm/mem_sharing.h>
+#include <xen/event.h>
+#include <asm/hvm/nestedhvm.h>
+#include <asm/hvm/svm/amd-iommu-proto.h>
+ 
+/* Printouts */
+#define P2M_PRINTK(_f, _a...)                                \
+    debugtrace_printk("p2m: %s(): " _f, __func__, ##_a)
+#define P2M_ERROR(_f, _a...)                                 \
+    printk("pg error: %s(): " _f, __func__, ##_a)
+#if P2M_DEBUGGING
+#define P2M_DEBUG(_f, _a...)                                 \
+    debugtrace_printk("p2mdebug: %s(): " _f, __func__, ##_a)
+#else
+#define P2M_DEBUG(_f, _a...) do { (void)(_f); } while(0)
+#endif
+
+
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_to_page
+#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+#if P2M_AUDIT
+extern void audit_p2m(struct p2m_domain *p2m, int strict_m2p);
+#else
+# define audit_p2m(_p2m, _m2p) do { (void)(_p2m),(_m2p); } while (0)
+#endif /* P2M_AUDIT */
+
+#define SUPERPAGE_PAGES (1UL << 9)
+#define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
+
+/*
+ * Populate-on-demand functionality
+ */
+
+static int
+p2m_pod_cache_add(struct p2m_domain *p2m,
+                  struct page_info *page,
+                  unsigned long order)
+{
+    int i;
+    struct page_info *p;
+    struct domain *d = p2m->domain;
+
+#ifndef NDEBUG
+    mfn_t mfn;
+
+    mfn = page_to_mfn(page);
+
+    /* Check to make sure this is a contiguous region */
+    if( mfn_x(mfn) & ((1 << order) - 1) )
+    {
+        printk("%s: mfn %lx not aligned order %lu! (mask %lx)\n",
+               __func__, mfn_x(mfn), order, ((1UL << order) - 1));
+        return -1;
+    }
+    
+    for(i=0; i < 1 << order ; i++) {
+        struct domain * od;
+
+        p = mfn_to_page(_mfn(mfn_x(mfn) + i));
+        od = page_get_owner(p);
+        if(od != d)
+        {
+            printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
+                   __func__, mfn_x(mfn), d->domain_id,
+                   od?od->domain_id:-1);
+            return -1;
+        }
+    }
+#endif
+
+    ASSERT(p2m_locked_by_me(p2m));
+
+    /*
+     * Pages from domain_alloc and returned by the balloon driver aren't
+     * guaranteed to be zero; but by reclaiming zero pages, we implicitly
+     * promise to provide zero pages. So we scrub pages before using.
+     */
+    for ( i = 0; i < (1 << order); i++ )
+    {
+        char *b = map_domain_page(mfn_x(page_to_mfn(page)) + i);
+        clear_page(b);
+        unmap_domain_page(b);
+    }
+
+    spin_lock(&d->page_alloc_lock);
+
+    /* First, take all pages off the domain list */
+    for(i=0; i < 1 << order ; i++)
+    {
+        p = page + i;
+        page_list_del(p, &d->page_list);
+    }
+
+    /* Then add the first one to the appropriate populate-on-demand list */
+    switch(order)
+    {
+    case 9:
+        page_list_add_tail(page, &p2m->pod.super); /* lock: page_alloc */
+        p2m->pod.count += 1 << order;
+        break;
+    case 0:
+        page_list_add_tail(page, &p2m->pod.single); /* lock: page_alloc */
+        p2m->pod.count += 1;
+        break;
+    default:
+        BUG();
+    }
+
+    /* Ensure that the PoD cache has never been emptied.  
+     * This may cause "zombie domains" since the page will never be freed. */
+    BUG_ON( d->arch.relmem != RELMEM_not_started );
+
+    spin_unlock(&d->page_alloc_lock);
+
+    return 0;
+}
+
+/* Get a page of size order from the populate-on-demand cache.  Will break
+ * down 2-meg pages into singleton pages automatically.  Returns null if
+ * a superpage is requested and no superpages are available.  Must be called
+ * with the d->page_lock held. */
+static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
+                                            unsigned long order)
+{
+    struct page_info *p = NULL;
+    int i;
+
+    if ( order == 9 && page_list_empty(&p2m->pod.super) )
+    {
+        return NULL;
+    }
+    else if ( order == 0 && page_list_empty(&p2m->pod.single) )
+    {
+        unsigned long mfn;
+        struct page_info *q;
+
+        BUG_ON( page_list_empty(&p2m->pod.super) );
+
+        /* Break up a superpage to make single pages. NB count doesn't
+         * need to be adjusted. */
+        p = page_list_remove_head(&p2m->pod.super);
+        mfn = mfn_x(page_to_mfn(p));
+
+        for ( i=0; i<SUPERPAGE_PAGES; i++ )
+        {
+            q = mfn_to_page(_mfn(mfn+i));
+            page_list_add_tail(q, &p2m->pod.single);
+        }
+    }
+
+    switch ( order )
+    {
+    case 9:
+        BUG_ON( page_list_empty(&p2m->pod.super) );
+        p = page_list_remove_head(&p2m->pod.super);
+        p2m->pod.count -= 1 << order; /* Lock: page_alloc */
+        break;
+    case 0:
+        BUG_ON( page_list_empty(&p2m->pod.single) );
+        p = page_list_remove_head(&p2m->pod.single);
+        p2m->pod.count -= 1;
+        break;
+    default:
+        BUG();
+    }
+
+    /* Put the pages back on the domain page_list */
+    for ( i = 0 ; i < (1 << order); i++ )
+    {
+        BUG_ON(page_get_owner(p + i) != p2m->domain);
+        page_list_add_tail(p + i, &p2m->domain->page_list);
+    }
+
+    return p;
+}
+
+/* Set the size of the cache, allocating or freeing as necessary. */
+static int
+p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible)
+{
+    struct domain *d = p2m->domain;
+    int ret = 0;
+
+    /* Increasing the target */
+    while ( pod_target > p2m->pod.count )
+    {
+        struct page_info * page;
+        int order;
+
+        if ( (pod_target - p2m->pod.count) >= SUPERPAGE_PAGES )
+            order = 9;
+        else
+            order = 0;
+    retry:
+        page = alloc_domheap_pages(d, order, 0);
+        if ( unlikely(page == NULL) )
+        {
+            if ( order == 9 )
+            {
+                /* If we can't allocate a superpage, try singleton pages */
+                order = 0;
+                goto retry;
+            }   
+            
+            printk("%s: Unable to allocate domheap page for pod cache.  target %lu cachesize %d\n",
+                   __func__, pod_target, p2m->pod.count);
+            ret = -ENOMEM;
+            goto out;
+        }
+
+        p2m_pod_cache_add(p2m, page, order);
+
+        if ( hypercall_preempt_check() && preemptible )
+        {
+            ret = -EAGAIN;
+            goto out;
+        }
+    }
+
+    /* Decreasing the target */
+    /* We hold the p2m lock here, so we don't need to worry about
+     * cache disappearing under our feet. */
+    while ( pod_target < p2m->pod.count )
+    {
+        struct page_info * page;
+        int order, i;
+
+        /* Grab the lock before checking that pod.super is empty, or the last
+         * entries may disappear before we grab the lock. */
+        spin_lock(&d->page_alloc_lock);
+
+        if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
+             && !page_list_empty(&p2m->pod.super) )
+            order = 9;
+        else
+            order = 0;
+
+        page = p2m_pod_cache_get(p2m, order);
+
+        ASSERT(page != NULL);
+
+        spin_unlock(&d->page_alloc_lock);
+
+        /* Then free them */
+        for ( i = 0 ; i < (1 << order) ; i++ )
+        {
+            /* Copied from common/memory.c:guest_remove_page() */
+            if ( unlikely(!get_page(page+i, d)) )
+            {
+                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
+                ret = -EINVAL;
+                goto out;
+            }
+
+            if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
+                put_page_and_type(page+i);
+            
+            if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
+                put_page(page+i);
+
+            put_page(page+i);
+
+            if ( hypercall_preempt_check() && preemptible )
+            {
+                ret = -EAGAIN;
+                goto out;
+            }
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * The "right behavior" here requires some careful thought.  First, some
+ * definitions:
+ * + M: static_max
+ * + B: number of pages the balloon driver has ballooned down to.
+ * + P: Number of populated pages. 
+ * + T: Old target
+ * + T': New target
+ *
+ * The following equations should hold:
+ *  0 <= P <= T <= B <= M
+ *  d->arch.p2m->pod.entry_count == B - P
+ *  d->tot_pages == P + d->arch.p2m->pod.count
+ *
+ * Now we have the following potential cases to cover:
+ *     B <T': Set the PoD cache size equal to the number of outstanding PoD
+ *   entries.  The balloon driver will deflate the balloon to give back
+ *   the remainder of the ram to the guest OS.
+ *  T <T'<B : Increase PoD cache size.
+ *  T'<T<=B : Here we have a choice.  We can decrease the size of the cache
+ *   and get the memory back right away.  However, that means every time we
+ *   reduce the memory target we risk the guest attempting to populate the 
+ *   memory before the balloon driver has reached its new target.  Safer to
+ *   never reduce the cache size here, but only when the balloon driver frees 
+ *   PoD ranges.
+ *
+ * If there are many zero pages, we could reach the target also by doing
+ * zero sweeps and marking the ranges PoD; but the balloon driver will have
+ * to free this memory eventually anyway, so we don't actually gain that much
+ * by doing so.
+ *
+ * NB that the equation (B<T') may require adjustment to the cache
+ * size as PoD pages are freed as well; i.e., freeing a PoD-backed
+ * entry when pod.entry_count == pod.count requires us to reduce both
+ * pod.entry_count and pod.count.
+ */
+int
+p2m_pod_set_mem_target(struct domain *d, unsigned long target)
+{
+    unsigned pod_target;
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    int ret = 0;
+    unsigned long populated;
+
+    p2m_lock(p2m);
+
+    /* P == B: Nothing to do. */
+    if ( p2m->pod.entry_count == 0 )
+        goto out;
+
+    /* Don't do anything if the domain is being torn down */
+    if ( d->is_dying )
+        goto out;
+
+    /* T' < B: Don't reduce the cache size; let the balloon driver
+     * take care of it. */
+    if ( target < d->tot_pages )
+        goto out;
+
+    populated  = d->tot_pages - p2m->pod.count;
+
+    pod_target = target - populated;
+
+    /* B < T': Set the cache size equal to # of outstanding entries,
+     * let the balloon driver fill in the rest. */
+    if ( pod_target > p2m->pod.entry_count )
+        pod_target = p2m->pod.entry_count;
+
+    ASSERT( pod_target >= p2m->pod.count );
+
+    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
+
+out:
+    p2m_unlock(p2m);
+
+    return ret;
+}
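
The arithmetic in p2m_pod_set_mem_target() follows directly from the invariants listed in the comment: the populated count P is tot_pages minus the cache size, and the new cache target is T' minus P, clamped to the number of outstanding PoD entries. A worked sketch with made-up numbers (compute_pod_target() is illustrative, not part of the patch):

#include <stdio.h>

/* Recompute the PoD cache target the way p2m_pod_set_mem_target() does. */
static unsigned long compute_pod_target(unsigned long target,      /* T' */
                                        unsigned long tot_pages,   /* P + pod.count */
                                        unsigned long pod_count,
                                        unsigned long entry_count) /* B - P */
{
    unsigned long populated = tot_pages - pod_count;    /* P */
    unsigned long pod_target = target - populated;

    /* B < T': only cache as many pages as there are outstanding PoD entries. */
    if ( pod_target > entry_count )
        pod_target = entry_count;

    return pod_target;
}

int main(void)
{
    /* P = 700, pod.count = 100, entry_count = 300 (so B = 1000), T' = 900. */
    printf("%lu\n", compute_pod_target(900, 800, 100, 300));  /* 200 */
    /* T' beyond B: clamp to the outstanding entry count. */
    printf("%lu\n", compute_pod_target(1200, 800, 100, 300)); /* 300 */
    return 0;
}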
+
+void
+p2m_pod_empty_cache(struct domain *d)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    struct page_info *page;
+
+    /* After this barrier no new PoD activities can happen. */
+    BUG_ON(!d->is_dying);
+    spin_barrier(&p2m->lock);
+
+    spin_lock(&d->page_alloc_lock);
+
+    while ( (page = page_list_remove_head(&p2m->pod.super)) )
+    {
+        int i;
+            
+        for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ )
+        {
+            BUG_ON(page_get_owner(page + i) != d);
+            page_list_add_tail(page + i, &d->page_list);
+        }
+
+        p2m->pod.count -= SUPERPAGE_PAGES;
+    }
+
+    while ( (page = page_list_remove_head(&p2m->pod.single)) )
+    {
+        BUG_ON(page_get_owner(page) != d);
+        page_list_add_tail(page, &d->page_list);
+
+        p2m->pod.count -= 1;
+    }
+
+    BUG_ON(p2m->pod.count != 0);
+
+    spin_unlock(&d->page_alloc_lock);
+}
+
+int
+p2m_pod_offline_or_broken_hit(struct page_info *p)
+{
+    struct domain *d;
+    struct p2m_domain *p2m;
+    struct page_info *q, *tmp;
+    unsigned long mfn, bmfn;
+
+    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
+        return 0;
+
+    spin_lock(&d->page_alloc_lock);
+    bmfn = mfn_x(page_to_mfn(p));
+    page_list_for_each_safe(q, tmp, &p2m->pod.super)
+    {
+        mfn = mfn_x(page_to_mfn(q));
+        if ( (bmfn >= mfn) && ((bmfn - mfn) < SUPERPAGE_PAGES) )
+        {
+            unsigned long i;
+            page_list_del(q, &p2m->pod.super);
+            for ( i = 0; i < SUPERPAGE_PAGES; i++)
+            {
+                q = mfn_to_page(_mfn(mfn + i));
+                page_list_add_tail(q, &p2m->pod.single);
+            }
+            page_list_del(p, &p2m->pod.single);
+            p2m->pod.count--;
+            goto pod_hit;
+        }
+    }
+
+    page_list_for_each_safe(q, tmp, &p2m->pod.single)
+    {
+        mfn = mfn_x(page_to_mfn(q));
+        if ( mfn == bmfn )
+        {
+            page_list_del(p, &p2m->pod.single);
+            p2m->pod.count--;
+            goto pod_hit;
+        }
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+    return 0;
+
+pod_hit:
+    page_list_add_tail(p, &d->arch.relmem_list);
+    spin_unlock(&d->page_alloc_lock);
+    return 1;
+}
+
+void
+p2m_pod_offline_or_broken_replace(struct page_info *p)
+{
+    struct domain *d;
+    struct p2m_domain *p2m;
+
+    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
+        return;
+
+    free_domheap_page(p);
+
+    p = alloc_domheap_page(d, 0);
+    if ( unlikely(!p) )
+        return;
+
+    p2m_lock(p2m);
+    p2m_pod_cache_add(p2m, p, 0);
+    p2m_unlock(p2m);
+    return;
+}
+
+/* This function is needed for two reasons:
+ * + To properly handle clearing of PoD entries
+ * + To "steal back" memory being freed for the PoD cache, rather than
+ *   releasing it.
+ *
+ * Once both of these functions have been completed, we can return and
+ * allow decrease_reservation() to handle everything else.
+ */
+int
+p2m_pod_decrease_reservation(struct domain *d,
+                             xen_pfn_t gpfn,
+                             unsigned int order)
+{
+    int ret=0;
+    int i;
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    int steal_for_cache = 0;
+    int pod = 0, nonpod = 0, ram = 0;
+    
+
+    /* If we don't have any outstanding PoD entries, let things take their
+     * course */
+    if ( p2m->pod.entry_count == 0 )
+        goto out;
+
+    /* Figure out if we need to steal some freed memory for our cache */
+    steal_for_cache =  ( p2m->pod.entry_count > p2m->pod.count );
+
+    p2m_lock(p2m);
+    audit_p2m(p2m, 1);
+
+    if ( unlikely(d->is_dying) )
+        goto out_unlock;
+
+    /* See what's in here. */
+    /* FIXME: Add contiguous; query for PSE entries? */
+    for ( i=0; i<(1<<order); i++)
+    {
+        p2m_type_t t;
+
+        gfn_to_mfn_query(p2m, gpfn + i, &t);
+
+        if ( t == p2m_populate_on_demand )
+            pod++;
+        else
+        {
+            nonpod++;
+            if ( p2m_is_ram(t) )
+                ram++;
+        }
+    }
+
+    /* No populate-on-demand?  Don't need to steal anything?  Then we're done!*/
+    if(!pod && !steal_for_cache)
+        goto out_unlock;
+
+    if ( !nonpod )
+    {
+        /* All PoD: Mark the whole region invalid and tell caller
+         * we're done. */
+        set_p2m_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid, p2m->default_access);
+        p2m->pod.entry_count-=(1<<order); /* Lock: p2m */
+        BUG_ON(p2m->pod.entry_count < 0);
+        ret = 1;
+        goto out_entry_check;
+    }
+
+    /* FIXME: Steal contig 2-meg regions for cache */
+
+    /* Process as long as:
+     * + There are PoD entries to handle, or
+     * + There is ram left, and we want to steal it
+     */
+    for ( i=0;
+          i<(1<<order) && (pod>0 || (steal_for_cache && ram > 0));
+          i++)
+    {
+        mfn_t mfn;
+        p2m_type_t t;
+
+        mfn = gfn_to_mfn_query(p2m, gpfn + i, &t);
+        if ( t == p2m_populate_on_demand )
+        {
+            set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
+            p2m->pod.entry_count--; /* Lock: p2m */
+            BUG_ON(p2m->pod.entry_count < 0);
+            pod--;
+        }
+        else if ( steal_for_cache && p2m_is_ram(t) )
+        {
+            struct page_info *page;
+
+            ASSERT(mfn_valid(mfn));
+
+            page = mfn_to_page(mfn);
+
+            set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
+            set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY);
+
+            p2m_pod_cache_add(p2m, page, 0);
+
+            steal_for_cache =  ( p2m->pod.entry_count > p2m->pod.count );
+
+            nonpod--;
+            ram--;
+        }
+    }    
+
+    /* If there are no more non-PoD entries, tell decrease_reservation() that
+     * there's nothing left to do. */
+    if ( nonpod == 0 )
+        ret = 1;
+
+out_entry_check:
+    /* If we've reduced our "liabilities" beyond our "assets", free some */
+    if ( p2m->pod.entry_count < p2m->pod.count )
+    {
+        p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/);
+    }
+
+out_unlock:
+    audit_p2m(p2m, 1);
+    p2m_unlock(p2m);
+
+out:
+    return ret;
+}
+
+void
+p2m_pod_dump_data(struct p2m_domain *p2m)
+{
+    printk("    PoD entries=%d cachesize=%d\n",
+           p2m->pod.entry_count, p2m->pod.count);
+}
+
+
+/* Search for all-zero superpages to be reclaimed as superpages for the
+ * PoD cache. Must be called w/ p2m lock held, page_alloc lock not held. */
+static int
+p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
+{
+    mfn_t mfn, mfn0 = _mfn(INVALID_MFN);
+    p2m_type_t type, type0 = 0;
+    unsigned long * map = NULL;
+    int ret=0, reset = 0;
+    int i, j;
+    int max_ref = 1;
+    struct domain *d = p2m->domain;
+
+    if ( !superpage_aligned(gfn) )
+        goto out;
+
+    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
+    if ( paging_mode_shadow(d) )
+        max_ref++;
+
+    /* Look up the mfns, checking to make sure they're the same mfn
+     * and aligned, and mapping them. */
+    for ( i=0; i<SUPERPAGE_PAGES; i++ )
+    {
+        
+        mfn = gfn_to_mfn_query(p2m, gfn + i, &type);
+
+        if ( i == 0 )
+        {
+            mfn0 = mfn;
+            type0 = type;
+        }
+
+        /* Conditions that must be met for superpage-superpage:
+         * + All gfns are ram types
+         * + All gfns have the same type
+         * + All of the mfns are allocated to a domain
+         * + None of the mfns are used as pagetables, or allocated via xenheap
+         * + The first mfn is 2-meg aligned
+         * + All the other mfns are in sequence
+         * Adding for good measure:
+         * + None of the mfns are likely to be mapped elsewhere (refcount
+         *   2 or less for shadow, 1 for hap)
+         */
+        if ( !p2m_is_ram(type)
+             || type != type0
+             || ( (mfn_to_page(mfn)->count_info & PGC_allocated) == 0 )
+             || ( (mfn_to_page(mfn)->count_info & (PGC_page_table|PGC_xen_heap)) != 0 )
+             || ( (mfn_to_page(mfn)->count_info & PGC_xen_heap  ) != 0 )
+             || ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > max_ref )
+             || !( ( i == 0 && superpage_aligned(mfn_x(mfn0)) )
+                   || ( i != 0 && mfn_x(mfn) == (mfn_x(mfn0) + i) ) ) )
+            goto out;
+    }
+
+    /* Now, do a quick check to see if it may be zero before unmapping. */
+    for ( i=0; i<SUPERPAGE_PAGES; i++ )
+    {
+        /* Quick zero-check */
+        map = map_domain_page(mfn_x(mfn0) + i);
+
+        for ( j=0; j<16; j++ )
+            if( *(map+j) != 0 )
+                break;
+
+        unmap_domain_page(map);
+
+        if ( j < 16 )
+            goto out;
+
+    }
+
+    /* Try to remove the page, restoring old mapping if it fails. */
+    set_p2m_entry(p2m, gfn,
+                  _mfn(POPULATE_ON_DEMAND_MFN), 9,
+                  p2m_populate_on_demand, p2m->default_access);
+
+    /* Make sure none of the MFNs are used elsewhere... for example, mapped
+     * via the grant table interface, or by qemu.  Allow one refcount for
+     * being allocated to the domain. */
+    for ( i=0; i < SUPERPAGE_PAGES; i++ )
+    {
+        mfn = _mfn(mfn_x(mfn0) + i);
+        if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 )
+        {
+            reset = 1;
+            goto out_reset;
+        }
+    }
+
+    /* Finally, do a full zero-check */
+    for ( i=0; i < SUPERPAGE_PAGES; i++ )
+    {
+        map = map_domain_page(mfn_x(mfn0) + i);
+
+        for ( j=0; j<PAGE_SIZE/sizeof(*map); j++ )
+            if( *(map+j) != 0 )
+            {
+                reset = 1;
+                break;
+            }
+
+        unmap_domain_page(map);
+
+        if ( reset )
+            goto out_reset;
+    }
+
+    if ( tb_init_done )
+    {
+        struct {
+            u64 gfn, mfn;
+            int d:16,order:16;
+        } t;
+
+        t.gfn = gfn;
+        t.mfn = mfn_x(mfn);
+        t.d = d->domain_id;
+        t.order = 9;
+
+        __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
+    }
+
+    /* Finally!  We've passed all the checks, and can add the mfn superpage
+     * back on the PoD cache, and account for the new p2m PoD entries */
+    p2m_pod_cache_add(p2m, mfn_to_page(mfn0), 9);
+    p2m->pod.entry_count += SUPERPAGE_PAGES;
+
+out_reset:
+    if ( reset )
+        set_p2m_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access);
+    
+out:
+    return ret;
+}
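
Both reclaim paths rely on the same cheap pre-filter: inspect only the first 16 words of a candidate page before paying for a full PAGE_SIZE scan (and, in the superpage case, before the p2m entry is dropped). A minimal sketch of that two-stage zero check on ordinary memory rather than a mapped domain page (page_is_zero() is illustrative only):

#include <stdio.h>

#define PAGE_SIZE 4096

/* Two-stage zero check: cheap 16-word pre-filter, then a full scan. */
static int page_is_zero(const unsigned long *map)
{
    size_t j, words = PAGE_SIZE / sizeof(*map);

    for ( j = 0; j < 16; j++ )          /* quick check, as in the code above */
        if ( map[j] != 0 )
            return 0;

    for ( j = 0; j < words; j++ )       /* full check only if that passed */
        if ( map[j] != 0 )
            return 0;

    return 1;
}

int main(void)
{
    static unsigned long page[PAGE_SIZE / sizeof(unsigned long)];

    printf("%d\n", page_is_zero(page)); /* 1: all zero */
    page[100] = 1;
    printf("%d\n", page_is_zero(page)); /* 0: caught by the full pass */
    return 0;
}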
+
+static void
+p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count)
+{
+    mfn_t mfns[count];
+    p2m_type_t types[count];
+    unsigned long * map[count];
+    struct domain *d = p2m->domain;
+
+    int i, j;
+    int max_ref = 1;
+
+    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
+    if ( paging_mode_shadow(d) )
+        max_ref++;
+
+    /* First, get the gfn list, translate to mfns, and map the pages. */
+    for ( i=0; i<count; i++ )
+    {
+        mfns[i] = gfn_to_mfn_query(p2m, gfns[i], types + i);
+        /* If this is ram, and not a pagetable or from the xen heap, and probably not mapped
+           elsewhere, map it; otherwise, skip. */
+        if ( p2m_is_ram(types[i])
+             && ( (mfn_to_page(mfns[i])->count_info & PGC_allocated) != 0 ) 
+             && ( (mfn_to_page(mfns[i])->count_info & (PGC_page_table|PGC_xen_heap)) == 0 )
+             && ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) <= max_ref ) )
+            map[i] = map_domain_page(mfn_x(mfns[i]));
+        else
+            map[i] = NULL;
+    }
+
+    /* Then, go through and check for zeroed pages, removing write permission
+     * for those with zeroes. */
+    for ( i=0; i<count; i++ )
+    {
+        if(!map[i])
+            continue;
+
+        /* Quick zero-check */
+        for ( j=0; j<16; j++ )
+            if( *(map[i]+j) != 0 )
+                break;
+
+        if ( j < 16 )
+        {
+            unmap_domain_page(map[i]);
+            map[i] = NULL;
+            continue;
+        }
+
+        /* Try to remove the page, restoring old mapping if it fails. */
+        set_p2m_entry(p2m, gfns[i],
+                      _mfn(POPULATE_ON_DEMAND_MFN), 0,
+                      p2m_populate_on_demand, p2m->default_access);
+
+        /* See if the page was successfully unmapped.  (Allow one refcount
+         * for being allocated to a domain.) */
+        if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 )
+        {
+            unmap_domain_page(map[i]);
+            map[i] = NULL;
+
+            set_p2m_entry(p2m, gfns[i], mfns[i], 0, types[i], p2m->default_access);
+
+            continue;
+        }
+    }
+
+    /* Now check each page for real */
+    for ( i=0; i < count; i++ )
+    {
+        if(!map[i])
+            continue;
+
+        for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
+            if( *(map[i]+j) != 0 )
+                break;
+
+        unmap_domain_page(map[i]);
+
+        /* See comment in p2m_pod_zero_check_superpage() re gnttab
+         * check timing.  */
+        if ( j < PAGE_SIZE/sizeof(*map[i]) )
+        {
+            set_p2m_entry(p2m, gfns[i], mfns[i], 0, types[i], p2m->default_access);
+        }
+        else
+        {
+            if ( tb_init_done )
+            {
+                struct {
+                    u64 gfn, mfn;
+                    int d:16,order:16;
+                } t;
+
+                t.gfn = gfns[i];
+                t.mfn = mfn_x(mfns[i]);
+                t.d = d->domain_id;
+                t.order = 0;
+        
+                __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
+            }
+
+            /* Add to cache, and account for the new p2m PoD entry */
+            p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), 0);
+            p2m->pod.entry_count++;
+        }
+    }
+    
+}
+
+#define POD_SWEEP_LIMIT 1024
+static void
+p2m_pod_emergency_sweep_super(struct p2m_domain *p2m)
+{
+    unsigned long i, start, limit;
+
+    if ( p2m->pod.reclaim_super == 0 )
+    {
+        p2m->pod.reclaim_super = (p2m->pod.max_guest>>9)<<9;
+        p2m->pod.reclaim_super -= SUPERPAGE_PAGES;
+    }
+    
+    start = p2m->pod.reclaim_super;
+    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
+
+    for ( i=p2m->pod.reclaim_super ; i > 0 ; i -= SUPERPAGE_PAGES )
+    {
+        p2m_pod_zero_check_superpage(p2m, i);
+        /* Stop if we're past our limit and we have found *something*.
+         *
+         * NB that this is a zero-sum game; we're increasing our cache size
+         * by increasing our 'debt'.  Since we hold the p2m lock,
+         * (entry_count - count) must remain the same. */
+        if ( !page_list_empty(&p2m->pod.super) &&  i < limit )
+            break;
+    }
+
+    p2m->pod.reclaim_super = i ? i - SUPERPAGE_PAGES : 0;
+}
+
+#define POD_SWEEP_STRIDE  16
+static void
+p2m_pod_emergency_sweep(struct p2m_domain *p2m)
+{
+    unsigned long gfns[POD_SWEEP_STRIDE];
+    unsigned long i, j=0, start, limit;
+    p2m_type_t t;
+
+
+    if ( p2m->pod.reclaim_single == 0 )
+        p2m->pod.reclaim_single = p2m->pod.max_guest;
+
+    start = p2m->pod.reclaim_single;
+    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
+
+    /* FIXME: Figure out how to avoid superpages */
+    for ( i=p2m->pod.reclaim_single; i > 0 ; i-- )
+    {
+        gfn_to_mfn_query(p2m, i, &t );
+        if ( p2m_is_ram(t) )
+        {
+            gfns[j] = i;
+            j++;
+            BUG_ON(j > POD_SWEEP_STRIDE);
+            if ( j == POD_SWEEP_STRIDE )
+            {
+                p2m_pod_zero_check(p2m, gfns, j);
+                j = 0;
+            }
+        }
+        /* Stop if we're past our limit and we have found *something*.
+         *
+         * NB that this is a zero-sum game; we're increasing our cache size
+         * by re-increasing our 'debt'.  Since we hold the p2m lock,
+         * (entry_count - count) must remain the same. */
+        if ( p2m->pod.count > 0 && i < limit )
+            break;
+    }
+
+    if ( j )
+        p2m_pod_zero_check(p2m, gfns, j);
+
+    p2m->pod.reclaim_single = i ? i - 1 : i;
+
+}
+
+int
+p2m_pod_demand_populate(struct p2m_domain *p2m, unsigned long gfn,
+                        unsigned int order,
+                        p2m_query_t q)
+{
+    struct domain *d = p2m->domain;
+    struct page_info *p = NULL; /* Compiler warnings */
+    unsigned long gfn_aligned;
+    mfn_t mfn;
+    int i;
+
+    ASSERT(p2m_locked_by_me(p2m));
+
+    /* This check is done with the p2m lock held.  This will make sure that
+     * even if d->is_dying changes under our feet, p2m_pod_empty_cache() 
+     * won't start until we're done. */
+    if ( unlikely(d->is_dying) )
+        goto out_fail;
+
+    /* Because PoD does not have a cache list for 1GB pages, it has to remap
+     * 1GB region to 2MB chunks for a retry. */
+    if ( order == 18 )
+    {
+        gfn_aligned = (gfn >> order) << order;
+        /* Note that we are supposed to call set_p2m_entry() 512 times to 
+         * split 1GB into 512 2MB pages here. But we only do it once here because
+         * set_p2m_entry() should automatically shatter the 1GB page into
+         * 512 2MB pages. The remaining 511 calls are unnecessary.
+         */
+        set_p2m_entry(p2m, gfn_aligned, _mfn(POPULATE_ON_DEMAND_MFN), 9,
+                      p2m_populate_on_demand, p2m->default_access);
+        audit_p2m(p2m, 1);
+        p2m_unlock(p2m);
+        return 0;
+    }
+
+    /* Once we've ballooned down enough that we can fill the remaining
+     * PoD entries from the cache, don't sweep even if the particular
+     * list we want to use is empty: that can lead to thrashing zero pages 
+     * through the cache for no good reason.  */
+    if ( p2m->pod.entry_count > p2m->pod.count )
+    {
+
+        /* If we're low, start a sweep */
+        if ( order == 9 && page_list_empty(&p2m->pod.super) )
+            p2m_pod_emergency_sweep_super(p2m);
+
+        if ( page_list_empty(&p2m->pod.single) &&
+             ( ( order == 0 )
+               || (order == 9 && page_list_empty(&p2m->pod.super) ) ) )
+            p2m_pod_emergency_sweep(p2m);
+    }
+
+    /* Keep track of the highest gfn demand-populated by a guest fault */
+    if ( q == p2m_guest && gfn > p2m->pod.max_guest )
+        p2m->pod.max_guest = gfn;
+
+    spin_lock(&d->page_alloc_lock);
+
+    if ( p2m->pod.count == 0 )
+        goto out_of_memory;
+
+    /* Get a page f/ the cache.  A NULL return value indicates that the
+     * 2-meg range should be marked singleton PoD, and retried */
+    if ( (p = p2m_pod_cache_get(p2m, order)) == NULL )
+        goto remap_and_retry;
+
+    mfn = page_to_mfn(p);
+
+    BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
+
+    spin_unlock(&d->page_alloc_lock);
+
+    gfn_aligned = (gfn >> order) << order;
+
+    set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, 
p2m->default_access);
+
+    for( i = 0; i < (1UL << order); i++ )
+    {
+        set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i);
+        paging_mark_dirty(d, mfn_x(mfn) + i);
+    }
+    
+    p2m->pod.entry_count -= (1 << order); /* Lock: p2m */
+    BUG_ON(p2m->pod.entry_count < 0);
+
+    if ( tb_init_done )
+    {
+        struct {
+            u64 gfn, mfn;
+            int d:16,order:16;
+        } t;
+
+        t.gfn = gfn;
+        t.mfn = mfn_x(mfn);
+        t.d = d->domain_id;
+        t.order = order;
+        
+        __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
+    }
+
+    return 0;
+out_of_memory:
+    spin_unlock(&d->page_alloc_lock);
+
+    printk("%s: Out of populate-on-demand memory! tot_pages %" PRIu32 " pod_entries %" PRIi32 "\n",
+           __func__, d->tot_pages, p2m->pod.entry_count);
+    domain_crash(d);
+out_fail:
+    return -1;
+remap_and_retry:
+    BUG_ON(order != 9);
+    spin_unlock(&d->page_alloc_lock);
+
+    /* Remap this 2-meg region in singleton chunks */
+    gfn_aligned = (gfn>>order)<<order;
+    for(i=0; i<(1<<order); i++)
+        set_p2m_entry(p2m, gfn_aligned+i, _mfn(POPULATE_ON_DEMAND_MFN), 0,
+                      p2m_populate_on_demand, p2m->default_access);
+    if ( tb_init_done )
+    {
+        struct {
+            u64 gfn;
+            int d:16;
+        } t;
+
+        t.gfn = gfn;
+        t.d = d->domain_id;
+        
+        __trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), &t);
+    }
+
+    return 0;
+}
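
Both the populate path and the splinter path above round the faulting gfn down to the start of its order-sized region with (gfn >> order) << order. A quick standalone check of that rounding, using made-up values:

#include <stdio.h>

/* Round a gfn down to the first gfn of its order-sized region. */
static unsigned long align_down(unsigned long gfn, unsigned int order)
{
    return (gfn >> order) << order;   /* clears the low 'order' bits */
}

int main(void)
{
    printf("%#lx\n", align_down(0x12345, 9));  /* 0x12200: start of the 2M region */
    printf("%#lx\n", align_down(0x12345, 18)); /* 0: start of the 1G region */
    return 0;
}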
+
+
+int
+guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
+                                      unsigned int order)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    unsigned long i;
+    p2m_type_t ot;
+    mfn_t omfn;
+    int pod_count = 0;
+    int rc = 0;
+
+    BUG_ON(!paging_mode_translate(d));
+
+    rc = p2m_gfn_check_limit(d, gfn, order);
+    if ( rc != 0 )
+        return rc;
+
+    p2m_lock(p2m);
+    audit_p2m(p2m, 1);
+
+    P2M_DEBUG("mark pod gfn=%#lx\n", gfn);
+
+    /* Make sure all gpfns are unused */
+    for ( i = 0; i < (1UL << order); i++ )
+    {
+        omfn = gfn_to_mfn_query(p2m, gfn + i, &ot);
+        if ( p2m_is_ram(ot) )
+        {
+            printk("%s: gfn_to_mfn returned type %d!\n",
+                   __func__, ot);
+            rc = -EBUSY;
+            goto out;
+        }
+        else if ( ot == p2m_populate_on_demand )
+        {
+            /* Count how many PoD entries we'll be replacing if successful */
+            pod_count++;
+        }
+    }
+
+    /* Now, actually do the two-way mapping */
+    if ( !set_p2m_entry(p2m, gfn, _mfn(POPULATE_ON_DEMAND_MFN), order,
+                        p2m_populate_on_demand, p2m->default_access) )
+        rc = -EINVAL;
+    else
+    {
+        p2m->pod.entry_count += 1 << order; /* Lock: p2m */
+        p2m->pod.entry_count -= pod_count;
+        BUG_ON(p2m->pod.entry_count < 0);
+    }
+
+    audit_p2m(p2m, 1);
+    p2m_unlock(p2m);
+
+out:
+    return rc;
+}
+
diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/p2m-pt.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/p2m-pt.c  Fri May 06 11:15:35 2011 +0100
@@ -0,0 +1,1301 @@
+/******************************************************************************
+ * arch/x86/mm/p2m-pt.c
+ *
+ * Implementation of p2m datastructures as pagetables, for use by 
+ * NPT and shadow-pagetable code
+ *
+ * Parts of this code are Copyright (c) 2009-2011 by Citrix Systems, Inc.
+ * Parts of this code are Copyright (c) 2007 by Advanced Micro Devices.
+ * Parts of this code are Copyright (c) 2006-2007 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <asm/domain.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <xen/iommu.h>
+#include <asm/mem_event.h>
+#include <public/mem_event.h>
+#include <asm/mem_sharing.h>
+#include <xen/event.h>
+#include <xen/trace.h>
+#include <asm/hvm/nestedhvm.h>
+#include <asm/hvm/svm/amd-iommu-proto.h>
+
+/* Debugging and auditing of the P2M code? */
+#define P2M_AUDIT     0
+#define P2M_DEBUGGING 0
+
+/* Printouts */
+#define P2M_PRINTK(_f, _a...)                                \
+    debugtrace_printk("p2m: %s(): " _f, __func__, ##_a)
+#define P2M_ERROR(_f, _a...)                                 \
+    printk("pg error: %s(): " _f, __func__, ##_a)
+#if P2M_DEBUGGING
+#define P2M_DEBUG(_f, _a...)                                 \
+    debugtrace_printk("p2mdebug: %s(): " _f, __func__, ##_a)
+#else
+#define P2M_DEBUG(_f, _a...) do { (void)(_f); } while(0)
+#endif
+
+
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_to_page
+#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+
+/* PTE flags for the various types of p2m entry */
+#define P2M_BASE_FLAGS \
+        (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
+
+#define SUPERPAGE_PAGES (1UL << 9)
+#define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
+
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+{
+    unsigned long flags;
+#ifdef __x86_64__
+    /*
+     * AMD IOMMU: When we share p2m table with iommu, bit 9 - bit 11 will be
+     * used for iommu hardware to encode next io page level. Bit 59 - bit 62
+     * are used for iommu flags, so we cannot use these bits to store p2m types.
+     */
+    flags = (unsigned long)(t & 0x7f) << 12;
+#else
+    flags = (t & 0x7UL) << 9;
+#endif
+#ifndef HAVE_GRANT_MAP_P2M
+    BUG_ON(p2m_is_grant(t));
+#endif
+    switch(t)
+    {
+    case p2m_invalid:
+    default:
+        return flags;
+    case p2m_ram_rw:
+    case p2m_grant_map_rw:
+        return flags | P2M_BASE_FLAGS | _PAGE_RW;
+    case p2m_ram_logdirty:
+        return flags | P2M_BASE_FLAGS;
+    case p2m_ram_ro:
+    case p2m_grant_map_ro:
+        return flags | P2M_BASE_FLAGS;
+    case p2m_ram_shared:
+        return flags | P2M_BASE_FLAGS;
+    case p2m_mmio_dm:
+        return flags;
+    case p2m_mmio_direct:
+        if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
+            flags |= _PAGE_RW;
+        return flags | P2M_BASE_FLAGS | _PAGE_PCD;
+    case p2m_populate_on_demand:
+        return flags;
+    }
+}
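
As the comment in p2m_type_to_flags() notes, on 64-bit builds the p2m type is stored in PTE bits 12 and up so that bits 9-11 and 59-62 remain free for the IOMMU. A minimal sketch of that encoding together with the assumed inverse (the real p2m_flags_to_type() is defined elsewhere in the tree; both helpers here are illustrative):

#include <assert.h>
#include <stdio.h>

/* Illustrative 64-bit encoding: the p2m type occupies flag bits 12 and up. */
static unsigned long type_to_flag_bits(unsigned int t)
{
    return (unsigned long)(t & 0x7f) << 12;
}

static unsigned int flag_bits_to_type(unsigned long flags)
{
    return (flags >> 12) & 0x7f;
}

int main(void)
{
    unsigned int t;

    for ( t = 0; t < 0x10; t++ )
        assert(flag_bits_to_type(type_to_flag_bits(t)) == t);
    printf("type <-> flag-bit round trip ok\n");
    return 0;
}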
+
+#if P2M_AUDIT
+void audit_p2m(struct p2m_domain *p2m, int strict_m2p);
+#else
+# define audit_p2m(_p2m, _m2p) do { (void)(_p2m),(_m2p); } while (0)
+#endif /* P2M_AUDIT */
+
+// Find the next level's P2M entry, checking for out-of-range gfn's...
+// Returns NULL on error.
+//
+l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+                   unsigned long gfn, uint32_t shift, uint32_t max)
+{
+    u32 index;
+
+    index = *gfn_remainder >> shift;
+    if ( index >= max )
+    {
+        P2M_DEBUG("gfn=0x%lx out of range "
+                  "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
+                  gfn, *gfn_remainder, shift, index, max);
+        return NULL;
+    }
+    *gfn_remainder &= (1 << shift) - 1;
+    return (l1_pgentry_t *)table + index;
+}
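
p2m_find_entry() is essentially a masked lookup: the index at the current level is gfn_remainder >> shift, and only the low shift bits are carried down to the next level. A small standalone illustration with a 9-bit level width:

#include <stdio.h>

int main(void)
{
    unsigned long gfn_remainder = 0x12345;  /* arbitrary example gfn */
    unsigned int shift = 9;                 /* 512 entries per level */

    unsigned long index = gfn_remainder >> shift;  /* entry within this table */
    gfn_remainder &= (1UL << shift) - 1;           /* offset handed to the next level */

    printf("index=%#lx remainder=%#lx\n", index, gfn_remainder); /* 0x91, 0x145 */
    return 0;
}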
+
+struct page_info *
+p2m_alloc_ptp(struct p2m_domain *p2m, unsigned long type)
+{
+    struct page_info *pg;
+
+    ASSERT(p2m);
+    ASSERT(p2m->domain);
+    ASSERT(p2m->domain->arch.paging.alloc_page);
+    pg = p2m->domain->arch.paging.alloc_page(p2m->domain);
+    if (pg == NULL)
+        return NULL;
+
+    page_list_add_tail(pg, &p2m->pages);
+    pg->u.inuse.type_info = type | 1 | PGT_validated;
+
+    return pg;
+}
+
+void
+p2m_free_ptp(struct p2m_domain *p2m, struct page_info *pg)
+{
+    ASSERT(pg);
+    ASSERT(p2m);
+    ASSERT(p2m->domain);
+    ASSERT(p2m->domain->arch.paging.free_page);
+
+    page_list_del(pg, &p2m->pages);
+    p2m->domain->arch.paging.free_page(p2m->domain, pg);
+
+    return;
+}
+
+/* Free intermediate tables from a p2m sub-tree */
+void
+p2m_free_entry(struct p2m_domain *p2m, l1_pgentry_t *p2m_entry, int page_order)
+{
+    /* End if the entry is a leaf entry. */
+    if ( page_order == 0
+         || !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT)
+         || (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        return;
+
+    if ( page_order > 9 )
+    {
+        l1_pgentry_t *l3_table = map_domain_page(l1e_get_pfn(*p2m_entry));
+        for ( int i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+            p2m_free_entry(p2m, l3_table + i, page_order - 9);
+        unmap_domain_page(l3_table);
+    }
+
+    p2m_free_ptp(p2m, mfn_to_page(_mfn(l1e_get_pfn(*p2m_entry))));
+}
+
+// Walk one level of the P2M table, allocating a new table if required.
+// Returns 0 on error.
+//
+
+/* AMD IOMMU: Convert next level bits and r/w bits into 24 bits p2m flags */
+#define iommu_nlevel_to_flags(nl, f) ((((nl) & 0x7) << 9 )|(((f) & 0x3) << 21))
+
+static void p2m_add_iommu_flags(l1_pgentry_t *p2m_entry,
+                                unsigned int nlevel, unsigned int flags)
+{
+#if CONFIG_PAGING_LEVELS == 4
+    if ( iommu_hap_pt_share )
+        l1e_add_flags(*p2m_entry, iommu_nlevel_to_flags(nlevel, flags));
+#endif
+}
+
+static int
+p2m_next_level(struct p2m_domain *p2m, mfn_t *table_mfn, void **table,
+               unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
+               u32 max, unsigned long type)
+{
+    l1_pgentry_t *l1_entry;
+    l1_pgentry_t *p2m_entry;
+    l1_pgentry_t new_entry;
+    void *next;
+    int i;
+
+    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
+                                      shift, max)) )
+        return 0;
+
+    /* PoD: Not present doesn't imply empty. */
+    if ( !l1e_get_flags(*p2m_entry) )
+    {
+        struct page_info *pg;
+
+        pg = p2m_alloc_ptp(p2m, type);
+        if ( pg == NULL )
+            return 0;
+
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR | _PAGE_USER);
+
+        switch ( type ) {
+        case PGT_l3_page_table:
+            p2m_add_iommu_flags(&new_entry, 3, IOMMUF_readable|IOMMUF_writable);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4);
+            break;
+        case PGT_l2_page_table:
+#if CONFIG_PAGING_LEVELS == 3
+            /* for PAE mode, PDPE only has PCD/PWT/P bits available */
+            new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
+#endif
+            p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
+            break;
+        case PGT_l1_page_table:
+            p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2);
+            break;
+        default:
+            BUG();
+            break;
+        }
+    }
+
+    ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
+
+    /* split 1GB pages into 2MB pages */
+    if ( type == PGT_l2_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+    {
+        unsigned long flags, pfn;
+        struct page_info *pg;
+
+        pg = p2m_alloc_ptp(p2m, PGT_l2_page_table);
+        if ( pg == NULL )
+            return 0;
+
+        flags = l1e_get_flags(*p2m_entry);
+        pfn = l1e_get_pfn(*p2m_entry);
+
+        l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
+        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+        {
+            new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
+            p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
+            p2m->write_p2m_entry(p2m, gfn,
+                l1_entry+i, *table_mfn, new_entry, 2);
+        }
+        unmap_domain_page(l1_entry);
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
+        p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
+    }
+
+
+    /* split single 2MB large page into 4KB page in P2M table */
+    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+    {
+        unsigned long flags, pfn;
+        struct page_info *pg;
+
+        pg = p2m_alloc_ptp(p2m, PGT_l1_page_table);
+        if ( pg == NULL )
+            return 0;
+
+        /* New splintered mappings inherit the flags of the old superpage, 
+         * with a little reorganisation for the _PAGE_PSE_PAT bit. */
+        flags = l1e_get_flags(*p2m_entry);
+        pfn = l1e_get_pfn(*p2m_entry);
+        if ( pfn & 1 )           /* ==> _PAGE_PSE_PAT was set */
+            pfn -= 1;            /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
+        else
+            flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
+        
+        l1_entry = __map_domain_page(pg);
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            new_entry = l1e_from_pfn(pfn + i, flags);
+            p2m_add_iommu_flags(&new_entry, 0, 0);
+            p2m->write_p2m_entry(p2m, gfn,
+                l1_entry+i, *table_mfn, new_entry, 1);
+        }
+        unmap_domain_page(l1_entry);
+        
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR|_PAGE_USER);
+        p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
+        p2m->write_p2m_entry(p2m, gfn,
+            p2m_entry, *table_mfn, new_entry, 2);
+    }
+
+    *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
+    next = map_domain_page(mfn_x(*table_mfn));
+    unmap_domain_page(*table);
+    *table = next;
+
+    return 1;
+}
+
+// Returns 0 on error (out of memory)
+static int
+p2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
+              unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma)
+{
+    // XXX -- this might be able to be faster iff current->domain == d
+    mfn_t table_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
+    void *table = map_domain_page(mfn_x(table_mfn));
+    unsigned long i, gfn_remainder = gfn;
+    l1_pgentry_t *p2m_entry;
+    l1_pgentry_t entry_content;
+    l2_pgentry_t l2e_content;
+    l3_pgentry_t l3e_content;
+    int rv=0;
+    unsigned int iommu_pte_flags = (p2mt == p2m_ram_rw) ?
+                                   IOMMUF_readable|IOMMUF_writable:
+                                   0; 
+    unsigned long old_mfn = 0;
+
+    if ( tb_init_done )
+    {
+        struct {
+            u64 gfn, mfn;
+            int p2mt;
+            int d:16,order:16;
+        } t;
+
+        t.gfn = gfn;
+        t.mfn = mfn_x(mfn);
+        t.p2mt = p2mt;
+        t.d = p2m->domain->domain_id;
+        t.order = page_order;
+
+        __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t);
+    }
+
+#if CONFIG_PAGING_LEVELS >= 4
+    if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
+                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
+                         L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
+        goto out;
+#endif
+    /*
+     * Try to allocate 1GB page table if this feature is supported.
+     */
+    if ( page_order == 18 )
+    {
+        l1_pgentry_t old_entry = l1e_empty();
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                   L3_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        {
+            /* We're replacing a non-SP page with a superpage.  Make sure to
+             * handle freeing the table properly. */
+            old_entry = *p2m_entry;
+        }
+
+        ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
+        l3e_content = mfn_valid(mfn) 
+            ? l3e_from_pfn(mfn_x(mfn),
+                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
+            : l3e_empty();
+        entry_content.l1 = l3e_content.l3;
+
+        if ( entry_content.l1 != 0 )
+        {
+            p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
+            old_mfn = l1e_get_pfn(*p2m_entry);
+        }
+
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3);
+        /* NB: paging_write_p2m_entry() handles tlb flushes properly */
+
+        /* Free old intermediate tables if necessary */
+        if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
+            p2m_free_entry(p2m, &old_entry, page_order);
+    }
+    /*
+     * When using PAE Xen, we only allow 33 bits of pseudo-physical
+     * address in translated guests (i.e. 8 GBytes).  This restriction
+     * comes from wanting to map the P2M table into the 16MB RO_MPT hole
+     * in Xen's address space for translated PV guests.
+     * When using AMD's NPT on PAE Xen, we are restricted to 4GB.
+     */
+    else if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
+                              L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+                              ((CONFIG_PAGING_LEVELS == 3)
+                               ? (hap_enabled(p2m->domain) ? 4 : 8)
+                               : L3_PAGETABLE_ENTRIES),
+                              PGT_l2_page_table) )
+        goto out;
+
+    if ( page_order == 0 )
+    {
+        if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
+                             L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                             L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+            goto out;
+
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   0, L1_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
+            entry_content = l1e_from_pfn(mfn_x(mfn),
+                                         p2m_type_to_flags(p2mt, mfn));
+        else
+            entry_content = l1e_empty();
+
+        if ( entry_content.l1 != 0 )
+        {
+            p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
+            old_mfn = l1e_get_pfn(*p2m_entry);
+        }
+        /* level 1 entry */
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
+        /* NB: paging_write_p2m_entry() handles tlb flushes properly */
+    }
+    else if ( page_order == 9 )
+    {
+        l1_pgentry_t old_entry = l1e_empty();
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                   L2_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        /* FIXME: Deal with 4k replaced by 2meg pages */
+        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        {
+            /* We're replacing a non-SP page with a superpage.  Make sure to
+             * handle freeing the table properly. */
+            old_entry = *p2m_entry;
+        }
+        
+        ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
+        if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
+            l2e_content = l2e_from_pfn(mfn_x(mfn),
+                                       p2m_type_to_flags(p2mt, mfn) |
+                                       _PAGE_PSE);
+        else
+            l2e_content = l2e_empty();
+        
+        entry_content.l1 = l2e_content.l2;
+
+        if ( entry_content.l1 != 0 )
+        {
+            p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
+            old_mfn = l1e_get_pfn(*p2m_entry);
+        }
+
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2);
+        /* NB: paging_write_p2m_entry() handles tlb flushes properly */
+
+        /* Free old intermediate tables if necessary */
+        if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
+            p2m_free_entry(p2m, &old_entry, page_order);
+    }
+
+    /* Track the highest gfn for which we have ever had a valid mapping */
+    if ( mfn_valid(mfn) 
+         && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
+        p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
+
+    if ( iommu_enabled && need_iommu(p2m->domain) )
+    {
+        if ( iommu_hap_pt_share )
+        {
+            if ( old_mfn && (old_mfn != mfn_x(mfn)) )
+                amd_iommu_flush_pages(p2m->domain, gfn, page_order);
+        }
+        else
+        {
+            if ( p2mt == p2m_ram_rw )
+                for ( i = 0; i < (1UL << page_order); i++ )
+                    iommu_map_page(p2m->domain, gfn+i, mfn_x(mfn)+i,
+                                   IOMMUF_readable|IOMMUF_writable);
+            else
+                for ( int i = 0; i < (1UL << page_order); i++ )
+                    iommu_unmap_page(p2m->domain, gfn+i);
+        }
+    }
+
+    /* Success */
+    rv = 1;
+
+out:
+    unmap_domain_page(table);
+    return rv;
+}
+
+
+/* Non-ept "lock-and-check" wrapper */
+static int p2m_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn,
+                                      l1_pgentry_t *p2m_entry, int order,
+                                      p2m_query_t q)
+{
+    /* Only take the lock if we don't already have it.  Otherwise it
+     * wouldn't be safe to do p2m lookups with the p2m lock held */
+    int do_locking = !p2m_locked_by_me(p2m);
+    int r;
+
+    if ( do_locking )
+        p2m_lock(p2m);
+
+    audit_p2m(p2m, 1);
+
+    /* Check to make sure this is still PoD */
+    if ( p2m_flags_to_type(l1e_get_flags(*p2m_entry)) != p2m_populate_on_demand )
+    {
+        if ( do_locking )
+            p2m_unlock(p2m);
+        return 0;
+    }
+
+    r = p2m_pod_demand_populate(p2m, gfn, order, q);
+
+    audit_p2m(p2m, 1);
+    if ( do_locking )
+        p2m_unlock(p2m);
+
+    return r;
+}
+
+
+static mfn_t
+p2m_gfn_to_mfn(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
+               p2m_query_t q)
+{
+    mfn_t mfn;
+    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
+    l2_pgentry_t *l2e;
+    l1_pgentry_t *l1e;
+
+    ASSERT(paging_mode_translate(p2m->domain));
+
+    /* XXX This is for compatibility with the old model, where anything not 
+     * XXX marked as RAM was considered to be emulated MMIO space.
+     * XXX Once we start explicitly registering MMIO regions in the p2m 
+     * XXX we will return p2m_invalid for unmapped gfns */
+    *t = p2m_mmio_dm;
+    /* Not implemented except with EPT */
+    *a = p2m_access_rwx; 
+
+    mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
+
+    if ( gfn > p2m->max_mapped_pfn )
+        /* This pfn is higher than the highest the p2m map currently holds */
+        return _mfn(INVALID_MFN);
+
+#if CONFIG_PAGING_LEVELS >= 4
+    {
+        l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
+        l4e += l4_table_offset(addr);
+        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
+        {
+            unmap_domain_page(l4e);
+            return _mfn(INVALID_MFN);
+        }
+        mfn = _mfn(l4e_get_pfn(*l4e));
+        unmap_domain_page(l4e);
+    }
+#endif
+    {
+        l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
+#if CONFIG_PAGING_LEVELS == 3
+        /* On PAE hosts the p2m has eight l3 entries, not four (see
+         * shadow_set_p2m_entry()) so we can't use l3_table_offset.
+         * Instead, just count the number of l3es from zero.  It's safe
+         * to do this because we already checked that the gfn is within
+         * the bounds of the p2m. */
+        l3e += (addr >> L3_PAGETABLE_SHIFT);
+#else
+        l3e += l3_table_offset(addr);
+#endif
+pod_retry_l3:
+        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+        {
+            if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand )
+            {
+                if ( q != p2m_query )
+                {
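+                    /* Order 18 covers a whole 1GB (512*512*4k) region */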
+                    if ( !p2m_pod_demand_populate(p2m, gfn, 18, q) )
+                        goto pod_retry_l3;
+                }
+                else
+                    *t = p2m_populate_on_demand;
+            }
+            unmap_domain_page(l3e);
+            return _mfn(INVALID_MFN);
+        }
+        else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
+        {
+            mfn = _mfn(l3e_get_pfn(*l3e) +
+                       l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
+                       l1_table_offset(addr));
+            *t = p2m_flags_to_type(l3e_get_flags(*l3e));
+            unmap_domain_page(l3e);
+
+            ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+            return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+        }
+
+        mfn = _mfn(l3e_get_pfn(*l3e));
+        unmap_domain_page(l3e);
+    }
+
+    l2e = map_domain_page(mfn_x(mfn));
+    l2e += l2_table_offset(addr);
+
+pod_retry_l2:
+    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+    {
+        /* PoD: Try to populate a 2-meg chunk */
+        if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
+        {
+            if ( q != p2m_query ) {
+                if ( !p2m_pod_check_and_populate(p2m, gfn,
+                                                 (l1_pgentry_t *)l2e, 9, q) )
+                    goto pod_retry_l2;
+            } else
+                *t = p2m_populate_on_demand;
+        }
+    
+        unmap_domain_page(l2e);
+        return _mfn(INVALID_MFN);
+    }
+    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
+    {
+        mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
+        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
+        unmap_domain_page(l2e);
+        
+        ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+        return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+    }
+
+    mfn = _mfn(l2e_get_pfn(*l2e));
+    unmap_domain_page(l2e);
+
+    l1e = map_domain_page(mfn_x(mfn));
+    l1e += l1_table_offset(addr);
+pod_retry_l1:
+    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+    {
+        /* PoD: Try to populate */
+        if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand )
+        {
+            if ( q != p2m_query ) {
+                if ( !p2m_pod_check_and_populate(p2m, gfn,
+                                                 (l1_pgentry_t *)l1e, 0, q) )
+                    goto pod_retry_l1;
+            } else
+                *t = p2m_populate_on_demand;
+        }
+    
+        unmap_domain_page(l1e);
+        return _mfn(INVALID_MFN);
+    }
+    mfn = _mfn(l1e_get_pfn(*l1e));
+    *t = p2m_flags_to_type(l1e_get_flags(*l1e));
+    unmap_domain_page(l1e);
+
+    ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+    return (p2m_is_valid(*t) || p2m_is_grant(*t)) ? mfn : _mfn(INVALID_MFN);
+}
+
+/* Read the current domain's p2m table (through the linear mapping). */
+static mfn_t p2m_gfn_to_mfn_current(struct p2m_domain *p2m,
+                                    unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
+                                    p2m_query_t q)
+{
+    mfn_t mfn = _mfn(INVALID_MFN);
+    p2m_type_t p2mt = p2m_mmio_dm;
+    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
+    /* XXX This is for compatibility with the old model, where anything not 
+     * XXX marked as RAM was considered to be emulated MMIO space.
+     * XXX Once we start explicitly registering MMIO regions in the p2m 
+     * XXX we will return p2m_invalid for unmapped gfns */
+
+    /* Not currently implemented except for EPT */
+    *a = p2m_access_rwx;
+
+    if ( gfn <= p2m->max_mapped_pfn )
+    {
+        l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
+        l2_pgentry_t l2e = l2e_empty();
+        int ret;
+#if CONFIG_PAGING_LEVELS >= 4
+        l3_pgentry_t l3e = l3e_empty();
+#endif
+
+        ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
+               / sizeof(l1_pgentry_t));
+
+#if CONFIG_PAGING_LEVELS >= 4
+        /*
+         * Read & process L3
+         */
+        p2m_entry = (l1_pgentry_t *)
+            &__linear_l2_table[l2_linear_offset(RO_MPT_VIRT_START)
+                               + l3_linear_offset(addr)];
+    pod_retry_l3:
+        ret = __copy_from_user(&l3e, p2m_entry, sizeof(l3e));
+
+        if ( ret != 0 || !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+        {
+            if ( (l3e_get_flags(l3e) & _PAGE_PSE) &&
+                 (p2m_flags_to_type(l3e_get_flags(l3e)) == p2m_populate_on_demand) )
+            {
+                /* The read has succeeded, so we know that the mapping exists */
+                if ( q != p2m_query )
+                {
+                    if ( !p2m_pod_demand_populate(p2m, gfn, 18, q) )
+                        goto pod_retry_l3;
+                    p2mt = p2m_invalid;
+                    printk("%s: Allocate 1GB failed!\n", __func__);
+                    goto out;
+                }
+                else
+                {
+                    p2mt = p2m_populate_on_demand;
+                    goto out;
+                }
+            }
+            goto pod_retry_l2;
+        }
+
+        if ( l3e_get_flags(l3e) & _PAGE_PSE )
+        {
+            p2mt = p2m_flags_to_type(l3e_get_flags(l3e));
+            ASSERT(l3e_get_pfn(l3e) != INVALID_MFN || !p2m_is_ram(p2mt));
+            if (p2m_is_valid(p2mt) )
+                mfn = _mfn(l3e_get_pfn(l3e) + 
+                           l2_table_offset(addr) * L1_PAGETABLE_ENTRIES + 
+                           l1_table_offset(addr));
+            else
+                p2mt = p2m_mmio_dm;
+            
+            goto out;
+        }
+#endif
+        /*
+         * Read & process L2
+         */
+        p2m_entry = &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START)
+                                       + l2_linear_offset(addr)];
+
+    pod_retry_l2:
+        ret = __copy_from_user(&l2e,
+                               p2m_entry,
+                               sizeof(l2e));
+        if ( ret != 0
+             || !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+        {
+            if( (l2e_get_flags(l2e) & _PAGE_PSE)
+                && ( p2m_flags_to_type(l2e_get_flags(l2e))
+                     == p2m_populate_on_demand ) )
+            {
+                /* The read has succeeded, so we know that the mapping
+                 * exists at this point.  */
+                if ( q != p2m_query )
+                {
+                    if ( !p2m_pod_check_and_populate(p2m, gfn,
+                                                     p2m_entry, 9, q) )
+                        goto pod_retry_l2;
+
+                    /* Allocate failed. */
+                    p2mt = p2m_invalid;
+                    printk("%s: Allocate failed!\n", __func__);
+                    goto out;
+                }
+                else
+                {
+                    p2mt = p2m_populate_on_demand;
+                    goto out;
+                }
+            }
+
+            goto pod_retry_l1;
+        }
+        
+        if (l2e_get_flags(l2e) & _PAGE_PSE)
+        {
+            p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
+            ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
+
+            if ( p2m_is_valid(p2mt) )
+                mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
+            else
+                p2mt = p2m_mmio_dm;
+
+            goto out;
+        }
+
+        /*
+         * Read and process L1
+         */
+
+        /* Need to __copy_from_user because the p2m is sparse and this
+         * part might not exist */
+    pod_retry_l1:
+        p2m_entry = &phys_to_machine_mapping[gfn];
+
+        ret = __copy_from_user(&l1e,
+                               p2m_entry,
+                               sizeof(l1e));
+            
+        if ( ret == 0 ) {
+            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+
+            if ( p2m_flags_to_type(l1e_get_flags(l1e))
+                 == p2m_populate_on_demand )
+            {
+                /* The read has succeeded, so we know that the mapping
+                 * exists at this point.  */
+                if ( q != p2m_query )
+                {
+                    if ( !p2m_pod_check_and_populate(p2m, gfn,
+                                                     (l1_pgentry_t *)p2m_entry, 0, q) )
+                        goto pod_retry_l1;
+
+                    /* Allocate failed. */
+                    p2mt = p2m_invalid;
+                    goto out;
+                }
+                else
+                {
+                    p2mt = p2m_populate_on_demand;
+                    goto out;
+                }
+            }
+
+            if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) )
+                mfn = _mfn(l1e_get_pfn(l1e));
+            else 
+                /* XXX see above */
+                p2mt = p2m_mmio_dm;
+        }
+    }
+out:
+    *t = p2mt;
+    return mfn;
+}
+
+/* Walk the whole p2m table, changing any entries of the old type
+ * to the new type.  This is used in hardware-assisted paging to
+ * quickly enable or disable log-dirty tracking */
+void p2m_change_type_global(struct p2m_domain *p2m, p2m_type_t ot, p2m_type_t nt)
+{
+    unsigned long mfn, gfn, flags;
+    l1_pgentry_t l1e_content;
+    l1_pgentry_t *l1e;
+    l2_pgentry_t *l2e;
+    mfn_t l1mfn, l2mfn, l3mfn;
+    unsigned long i1, i2, i3;
+    l3_pgentry_t *l3e;
+#if CONFIG_PAGING_LEVELS == 4
+    l4_pgentry_t *l4e;
+    unsigned long i4;
+#endif /* CONFIG_PAGING_LEVELS == 4 */
+
+    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+    BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
+
+    if ( !paging_mode_translate(p2m->domain) )
+        return;
+
+    if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) == 0 )
+        return;
+
+    ASSERT(p2m_locked_by_me(p2m));
+
+#if CONFIG_PAGING_LEVELS == 4
+    l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+#else /* CONFIG_PAGING_LEVELS == 3 */
+    l3mfn = _mfn(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+    l3e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+    for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+    {
+        if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+        {
+            continue;
+        }
+        l3mfn = _mfn(l4e_get_pfn(l4e[i4]));
+        l3e = map_domain_page(l4e_get_pfn(l4e[i4]));
+#endif
+        for ( i3 = 0;
+              i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
+              i3++ )
+        {
+            if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+            {
+                continue;
+            }
+            if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) )
+            {
+                flags = l3e_get_flags(l3e[i3]);
+                if ( p2m_flags_to_type(flags) != ot )
+                    continue;
+                mfn = l3e_get_pfn(l3e[i3]);
+                gfn = get_gpfn_from_mfn(mfn);
+                flags = p2m_type_to_flags(nt, _mfn(mfn));
+                l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+                p2m->write_p2m_entry(p2m, gfn,
+                                     (l1_pgentry_t *)&l3e[i3],
+                                     l3mfn, l1e_content, 3);
+                continue;
+            }
+
+            l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
+            l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
+            for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+            {
+                if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+                {
+                    continue;
+                }
+
+                if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
+                {
+                    flags = l2e_get_flags(l2e[i2]);
+                    if ( p2m_flags_to_type(flags) != ot )
+                        continue;
+                    mfn = l2e_get_pfn(l2e[i2]);
+                    /* Do not use get_gpfn_from_mfn because it may return 
+                       SHARED_M2P_ENTRY */
+                    gfn = (i2 + (i3
+#if CONFIG_PAGING_LEVELS >= 4
+                                  + (i4 * L3_PAGETABLE_ENTRIES)
+#endif
+                               )
+                           * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
+                    flags = p2m_type_to_flags(nt, _mfn(mfn));
+                    l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+                    p2m->write_p2m_entry(p2m, gfn,
+                                         (l1_pgentry_t *)&l2e[i2],
+                                         l2mfn, l1e_content, 2);
+                    continue;
+                }
+
+                l1mfn = _mfn(l2e_get_pfn(l2e[i2]));
+                l1e = map_domain_page(mfn_x(l1mfn));
+
+                for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+                {
+                    flags = l1e_get_flags(l1e[i1]);
+                    if ( p2m_flags_to_type(flags) != ot )
+                        continue;
+                    mfn = l1e_get_pfn(l1e[i1]);
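+                    /* Reconstruct the gfn from the table indices; on
+                     * 64-bit this is ((i4*512 + i3)*512 + i2)*512 + i1. */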
+                    gfn = i1 + (i2 + (i3
+#if CONFIG_PAGING_LEVELS >= 4
+                                       + (i4 * L3_PAGETABLE_ENTRIES)
+#endif
+                                    )
+                           * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
+                    /* create a new l1e entry with the new type */
+                    flags = p2m_type_to_flags(nt, _mfn(mfn));
+                    l1e_content = l1e_from_pfn(mfn, flags);
+                    p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
+                                         l1mfn, l1e_content, 1);
+                }
+                unmap_domain_page(l1e);
+            }
+            unmap_domain_page(l2e);
+        }
+#if CONFIG_PAGING_LEVELS >= 4
+        unmap_domain_page(l3e);
+    }
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+    unmap_domain_page(l4e);
+#else /* CONFIG_PAGING_LEVELS == 3 */
+    unmap_domain_page(l3e);
+#endif
+
+}
+
+/* Set up the p2m function pointers for pagetable format */
+void p2m_pt_init(struct p2m_domain *p2m)
+{
+    p2m->set_entry = p2m_set_entry;
+    p2m->get_entry = p2m_gfn_to_mfn;
+    p2m->get_entry_current = p2m_gfn_to_mfn_current;
+    p2m->change_entry_type_global = p2m_change_type_global;
+    p2m->write_p2m_entry = paging_write_p2m_entry;
+}
+
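+/* A minimal sketch of how the common code is expected to choose an
+ * implementation at initialisation time (the exact guard used in p2m.c
+ * may differ):
+ *
+ *     if ( hap_enabled(d) && cpu_has_vmx )
+ *         ept_p2m_init(p2m);    // EPT-format tables (p2m-ept.c)
+ *     else
+ *         p2m_pt_init(p2m);     // ordinary pagetable format (this file)
+ */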
+
+#if P2M_AUDIT
+/* strict_m2p == 0 allows m2p mappings that don't match the p2m.
+ * It's intended for add_to_physmap, when the domain has just been allocated 
+ * new mfns that might have stale m2p entries from previous owners */
+void audit_p2m(struct p2m_domain *p2m, int strict_m2p)
+{
+    struct page_info *page;
+    struct domain *od;
+    unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
+    int entry_count = 0;
+    mfn_t p2mfn;
+    unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
+    int test_linear;
+    p2m_type_t type;
+    struct domain *d = p2m->domain;
+
+    if ( !paging_mode_translate(d) )
+        return;
+
+    //P2M_PRINTK("p2m audit starts\n");
+
+    test_linear = ( (d == current->domain)
+                    && !pagetable_is_null(current->arch.monitor_table) );
+    if ( test_linear )
+        flush_tlb_local();
+
+    spin_lock(&d->page_alloc_lock);
+
+    /* Audit part one: walk the domain's page allocation list, checking
+     * the m2p entries. */
+    page_list_for_each ( page, &d->page_list )
+    {
+        mfn = mfn_x(page_to_mfn(page));
+
+        // P2M_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
+
+        od = page_get_owner(page);
+
+        if ( od != d )
+        {
+            P2M_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
+                       mfn, od, (od?od->domain_id:-1), d, d->domain_id);
+            continue;
+        }
+
+        gfn = get_gpfn_from_mfn(mfn);
+        if ( gfn == INVALID_M2P_ENTRY )
+        {
+            orphans_i++;
+            //P2M_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
+            //               mfn);
+            continue;
+        }
+
+        if ( gfn == 0x55555555 || gfn == 0x5555555555555555 )
+        {
+            orphans_d++;
+            //P2M_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
+            //               mfn);
+            continue;
+        }
+
+        if ( gfn == SHARED_M2P_ENTRY )
+        {
+            P2M_PRINTK("shared mfn (%lx) on domain page list!\n",
+                    mfn);
+            continue;
+        }
+
+        p2mfn = gfn_to_mfn_type_p2m(p2m, gfn, &type, p2m_query);
+        if ( strict_m2p && mfn_x(p2mfn) != mfn )
+        {
+            mpbad++;
+            P2M_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
+                       " (-> gfn %#lx)\n",
+                       mfn, gfn, mfn_x(p2mfn),
+                       (mfn_valid(p2mfn)
+                        ? get_gpfn_from_mfn(mfn_x(p2mfn))
+                        : -1u));
+            /* This m2p entry is stale: the domain has another frame in
+             * this physical slot.  No great disaster, but for neatness,
+             * blow away the m2p entry. */
+            set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+        }
+
+        if ( test_linear && (gfn <= p2m->max_mapped_pfn) )
+        {
+            lp2mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &type));
+            if ( lp2mfn != mfn_x(p2mfn) )
+            {
+                P2M_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
+                           "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn));
+            }
+        }
+
+        // P2M_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
+        //                mfn, gfn, mfn_x(p2mfn), lp2mfn);
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+
+    /* Audit part two: walk the domain's p2m table, checking the entries. */
+    if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) != 0 )
+    {
+        l2_pgentry_t *l2e;
+        l1_pgentry_t *l1e;
+        int i1, i2;
+
+#if CONFIG_PAGING_LEVELS == 4
+        l4_pgentry_t *l4e;
+        l3_pgentry_t *l3e;
+        int i4, i3;
+        l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+#else /* CONFIG_PAGING_LEVELS == 3 */
+        l3_pgentry_t *l3e;
+        int i3;
+        l3e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+#endif
+
+        gfn = 0;
+#if CONFIG_PAGING_LEVELS >= 4
+        for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+        {
+            if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+            {
+                gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
+                continue;
+            }
+            l3e = map_domain_page(mfn_x(_mfn(l4e_get_pfn(l4e[i4]))));
+#endif
+            for ( i3 = 0;
+                  i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
+                  i3++ )
+            {
+                if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+                {
+                    gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+                    continue;
+                }
+
+                /* check for 1GB super page */
+                if ( l3e_get_flags(l3e[i3]) & _PAGE_PSE )
+                {
+                    mfn = l3e_get_pfn(l3e[i3]);
+                    ASSERT(mfn_valid(_mfn(mfn)));
+                    /* we have to cover 512x512 4K pages */
+                    for ( i2 = 0; 
+                          i2 < (L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES);
+                          i2++)
+                    {
+                        m2pfn = get_gpfn_from_mfn(mfn+i2);
+                        if ( m2pfn != (gfn + i2) )
+                        {
+                            pmbad++;
+                            P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                       " -> gfn %#lx\n", gfn+i2, mfn+i2,
+                                       m2pfn);
+                            BUG();
+                        }
+                        gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+                }
+
+                l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3]))));
+                for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+                {
+                    if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+                    {
+                        if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE)
+                             && ( p2m_flags_to_type(l2e_get_flags(l2e[i2]))
+                                  == p2m_populate_on_demand ) )
+                            entry_count+=SUPERPAGE_PAGES;
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+                    
+                    /* check for super page */
+                    if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE )
+                    {
+                        mfn = l2e_get_pfn(l2e[i2]);
+                        ASSERT(mfn_valid(_mfn(mfn)));
+                        for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
+                        {
+                            m2pfn = get_gpfn_from_mfn(mfn+i1);
+                            /* Allow shared M2Ps */
+                            if ( (m2pfn != (gfn + i1)) &&
+                                 (m2pfn != SHARED_M2P_ENTRY) )
+                            {
+                                pmbad++;
+                                P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                           " -> gfn %#lx\n", gfn+i1, mfn+i1,
+                                           m2pfn);
+                                BUG();
+                            }
+                        }
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+
+                    l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
+
+                    for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+                    {
+                        p2m_type_t type;
+
+                        type = p2m_flags_to_type(l1e_get_flags(l1e[i1]));
+                        if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+                        {
+                            if ( type == p2m_populate_on_demand )
+                                entry_count++;
+                            continue;
+                        }
+                        mfn = l1e_get_pfn(l1e[i1]);
+                        ASSERT(mfn_valid(_mfn(mfn)));
+                        m2pfn = get_gpfn_from_mfn(mfn);
+                        if ( m2pfn != gfn &&
+                             type != p2m_mmio_direct &&
+                             !p2m_is_grant(type) &&
+                             !p2m_is_shared(type) )
+                        {
+                            pmbad++;
+                            printk("mismatch: gfn %#lx -> mfn %#lx"
+                                   " -> gfn %#lx\n", gfn, mfn, m2pfn);
+                            P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                       " -> gfn %#lx\n", gfn, mfn, m2pfn);
+                            BUG();
+                        }
+                    }
+                    unmap_domain_page(l1e);
+                }
+                unmap_domain_page(l2e);
+            }
+#if CONFIG_PAGING_LEVELS >= 4
+            unmap_domain_page(l3e);
+        }
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+        unmap_domain_page(l4e);
+#else /* CONFIG_PAGING_LEVELS == 3 */
+        unmap_domain_page(l3e);
+#endif
+
+    }
+
+    if ( entry_count != p2m->pod.entry_count )
+    {
+        printk("%s: refcounted entry count %d, audit count %d!\n",
+               __func__,
+               p2m->pod.entry_count,
+               entry_count);
+        BUG();
+    }
+        
+    //P2M_PRINTK("p2m audit complete\n");
+    //if ( orphans_i | orphans_d | mpbad | pmbad )
+    //    P2M_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
+    //                   orphans_i + orphans_d, orphans_i, orphans_d);
+    if ( mpbad | pmbad )
+    {
+        P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
+                   pmbad, mpbad);
+        WARN();
+    }
+}
+#endif /* P2M_AUDIT */
+
diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c     Thu May 05 17:40:34 2011 +0100
+++ b/xen/arch/x86/mm/p2m.c     Fri May 06 11:15:35 2011 +0100
@@ -37,10 +37,6 @@
 #include <asm/hvm/nestedhvm.h>
 #include <asm/hvm/svm/amd-iommu-proto.h>
 
-/* Debugging and auditing of the P2M code? */
-#define P2M_AUDIT     0
-#define P2M_DEBUGGING 0
-
 /* turn on/off 1GB host page table support for hap, default on */
 static bool_t __read_mostly opt_hap_1gb = 1;
 boolean_param("hap_1gb", opt_hap_1gb);
@@ -69,1853 +65,14 @@ boolean_param("hap_2mb", opt_hap_2mb);
 #undef page_to_mfn
 #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
 
-
-/* PTE flags for the various types of p2m entry */
-#define P2M_BASE_FLAGS \
-        (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
-
-#define SUPERPAGE_PAGES (1UL << 9)
-#define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
-
-unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
-{
-    unsigned long flags;
-#ifdef __x86_64__
-    /*
-     * AMD IOMMU: When we share the p2m table with the iommu, bits 9 - 11 are
-     * used by the iommu hardware to encode the next io page level.  Bits 59 - 62
-     * are used for iommu flags, so we cannot use these bits to store p2m types.
-     */
-    flags = (unsigned long)(t & 0x7f) << 12;
-#else
-    flags = (t & 0x7UL) << 9;
-#endif
-#ifndef HAVE_GRANT_MAP_P2M
-    BUG_ON(p2m_is_grant(t));
-#endif
-    switch(t)
-    {
-    case p2m_invalid:
-    default:
-        return flags;
-    case p2m_ram_rw:
-    case p2m_grant_map_rw:
-        return flags | P2M_BASE_FLAGS | _PAGE_RW;
-    case p2m_ram_logdirty:
-        return flags | P2M_BASE_FLAGS;
-    case p2m_ram_ro:
-    case p2m_grant_map_ro:
-        return flags | P2M_BASE_FLAGS;
-    case p2m_ram_shared:
-        return flags | P2M_BASE_FLAGS;
-    case p2m_mmio_dm:
-        return flags;
-    case p2m_mmio_direct:
-        if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
-            flags |= _PAGE_RW;
-        return flags | P2M_BASE_FLAGS | _PAGE_PCD;
-    case p2m_populate_on_demand:
-        return flags;
-    }
-}
-
 #if P2M_AUDIT
-static void audit_p2m(struct p2m_domain *p2m, int strict_m2p);
+extern void audit_p2m(struct p2m_domain *p2m, int strict_m2p);
 #else
 # define audit_p2m(_p2m, _m2p) do { (void)(_p2m),(_m2p); } while (0)
 #endif /* P2M_AUDIT */
 
-// Find the next level's P2M entry, checking for out-of-range gfn's...
-// Returns NULL on error.
-//
-l1_pgentry_t *
-p2m_find_entry(void *table, unsigned long *gfn_remainder,
-                   unsigned long gfn, uint32_t shift, uint32_t max)
-{
-    u32 index;
-
-    index = *gfn_remainder >> shift;
-    if ( index >= max )
-    {
-        P2M_DEBUG("gfn=0x%lx out of range "
-                  "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
-                  gfn, *gfn_remainder, shift, index, max);
-        return NULL;
-    }
-    *gfn_remainder &= (1 << shift) - 1;
-    return (l1_pgentry_t *)table + index;
-}
-
-struct page_info *
-p2m_alloc_ptp(struct p2m_domain *p2m, unsigned long type)
-{
-    struct page_info *pg;
-
-    ASSERT(p2m);
-    ASSERT(p2m->domain);
-    ASSERT(p2m->domain->arch.paging.alloc_page);
-    pg = p2m->domain->arch.paging.alloc_page(p2m->domain);
-    if (pg == NULL)
-        return NULL;
-
-    page_list_add_tail(pg, &p2m->pages);
-    pg->u.inuse.type_info = type | 1 | PGT_validated;
-
-    return pg;
-}
-
-void
-p2m_free_ptp(struct p2m_domain *p2m, struct page_info *pg)
-{
-    ASSERT(pg);
-    ASSERT(p2m);
-    ASSERT(p2m->domain);
-    ASSERT(p2m->domain->arch.paging.free_page);
-
-    page_list_del(pg, &p2m->pages);
-    p2m->domain->arch.paging.free_page(p2m->domain, pg);
-
-    return;
-}
-
-/* Free intermediate tables from a p2m sub-tree */
-void
-p2m_free_entry(struct p2m_domain *p2m, l1_pgentry_t *p2m_entry, int page_order)
-{
-    /* End if the entry is a leaf entry. */
-    if ( page_order == 0
-         || !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT)
-         || (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
-        return;
-
-    if ( page_order > 9 )
-    {
-        l1_pgentry_t *l3_table = map_domain_page(l1e_get_pfn(*p2m_entry));
-        for ( int i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
-            p2m_free_entry(p2m, l3_table + i, page_order - 9);
-        unmap_domain_page(l3_table);
-    }
-
-    p2m_free_ptp(p2m, mfn_to_page(_mfn(l1e_get_pfn(*p2m_entry))));
-}
-
-// Walk one level of the P2M table, allocating a new table if required.
-// Returns 0 on error.
-//
-
-/* AMD IOMMU: Convert next level bits and r/w bits into 24 bits p2m flags */
-#define iommu_nlevel_to_flags(nl, f) ((((nl) & 0x7) << 9 )|(((f) & 0x3) << 21))
-
-static void p2m_add_iommu_flags(l1_pgentry_t *p2m_entry,
-                                unsigned int nlevel, unsigned int flags)
-{
-#if CONFIG_PAGING_LEVELS == 4
-    if ( iommu_hap_pt_share )
-        l1e_add_flags(*p2m_entry, iommu_nlevel_to_flags(nlevel, flags));
-#endif
-}
-
-static int
-p2m_next_level(struct p2m_domain *p2m, mfn_t *table_mfn, void **table,
-               unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
-               u32 max, unsigned long type)
-{
-    l1_pgentry_t *l1_entry;
-    l1_pgentry_t *p2m_entry;
-    l1_pgentry_t new_entry;
-    void *next;
-    int i;
-
-    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
-                                      shift, max)) )
-        return 0;
-
-    /* PoD: Not present doesn't imply empty. */
-    if ( !l1e_get_flags(*p2m_entry) )
-    {
-        struct page_info *pg;
-
-        pg = p2m_alloc_ptp(p2m, type);
-        if ( pg == NULL )
-            return 0;
-
-        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
-                                 __PAGE_HYPERVISOR | _PAGE_USER);
-
-        switch ( type ) {
-        case PGT_l3_page_table:
-            p2m_add_iommu_flags(&new_entry, 3, IOMMUF_readable|IOMMUF_writable);
-            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4);
-            break;
-        case PGT_l2_page_table:
-#if CONFIG_PAGING_LEVELS == 3
-            /* for PAE mode, PDPE only has PCD/PWT/P bits available */
-            new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
-#endif
-            p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
-            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
-            break;
-        case PGT_l1_page_table:
-            p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
-            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2);
-            break;
-        default:
-            BUG();
-            break;
-        }
-    }
-
-    ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
-
-    /* split 1GB pages into 2MB pages */
-    if ( type == PGT_l2_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
-    {
-        unsigned long flags, pfn;
-        struct page_info *pg;
-
-        pg = p2m_alloc_ptp(p2m, PGT_l2_page_table);
-        if ( pg == NULL )
-            return 0;
-
-        flags = l1e_get_flags(*p2m_entry);
-        pfn = l1e_get_pfn(*p2m_entry);
-
-        l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
-        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-        {
-            new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
-            p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
-            p2m->write_p2m_entry(p2m, gfn,
-                l1_entry+i, *table_mfn, new_entry, 2);
-        }
-        unmap_domain_page(l1_entry);
-        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
-                                 __PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
-        p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
-        p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
-    }
-
-
-    /* split single 2MB large page into 4KB page in P2M table */
-    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
-    {
-        unsigned long flags, pfn;
-        struct page_info *pg;
-
-        pg = p2m_alloc_ptp(p2m, PGT_l1_page_table);
-        if ( pg == NULL )
-            return 0;
-
-        /* New splintered mappings inherit the flags of the old superpage, 
-         * with a little reorganisation for the _PAGE_PSE_PAT bit. */
-        flags = l1e_get_flags(*p2m_entry);
-        pfn = l1e_get_pfn(*p2m_entry);
-        if ( pfn & 1 )           /* ==> _PAGE_PSE_PAT was set */
-            pfn -= 1;            /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
-        else
-            flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
-        
-        l1_entry = __map_domain_page(pg);
-        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-        {
-            new_entry = l1e_from_pfn(pfn + i, flags);
-            p2m_add_iommu_flags(&new_entry, 0, 0);
-            p2m->write_p2m_entry(p2m, gfn,
-                l1_entry+i, *table_mfn, new_entry, 1);
-        }
-        unmap_domain_page(l1_entry);
-        
-        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
-                                 __PAGE_HYPERVISOR|_PAGE_USER);
-        p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
-        p2m->write_p2m_entry(p2m, gfn,
-            p2m_entry, *table_mfn, new_entry, 2);
-    }
-
-    *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
-    next = map_domain_page(mfn_x(*table_mfn));
-    unmap_domain_page(*table);
-    *table = next;
-
-    return 1;
-}
-
-/*
- * Populate-on-demand functionality
- */
-static
-int set_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
-                  unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma);
-
-static int
-p2m_pod_cache_add(struct p2m_domain *p2m,
-                  struct page_info *page,
-                  unsigned long order)
-{
-    int i;
-    struct page_info *p;
-    struct domain *d = p2m->domain;
-
-#ifndef NDEBUG
-    mfn_t mfn;
-
-    mfn = page_to_mfn(page);
-
-    /* Check to make sure this is a contiguous region */
-    if( mfn_x(mfn) & ((1 << order) - 1) )
-    {
-        printk("%s: mfn %lx not aligned order %lu! (mask %lx)\n",
-               __func__, mfn_x(mfn), order, ((1UL << order) - 1));
-        return -1;
-    }
-    
-    for(i=0; i < 1 << order ; i++) {
-        struct domain * od;
-
-        p = mfn_to_page(_mfn(mfn_x(mfn) + i));
-        od = page_get_owner(p);
-        if(od != d)
-        {
-            printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
-                   __func__, mfn_x(mfn), d->domain_id,
-                   od?od->domain_id:-1);
-            return -1;
-        }
-    }
-#endif
-
-    ASSERT(p2m_locked_by_me(p2m));
-
-    /*
-     * Pages from domain_alloc and returned by the balloon driver aren't
-     * guaranteed to be zero; but by reclaiming zero pages, we implicitly
-     * promise to provide zero pages. So we scrub pages before using.
-     */
-    for ( i = 0; i < (1 << order); i++ )
-    {
-        char *b = map_domain_page(mfn_x(page_to_mfn(page)) + i);
-        clear_page(b);
-        unmap_domain_page(b);
-    }
-
-    spin_lock(&d->page_alloc_lock);
-
-    /* First, take all pages off the domain list */
-    for(i=0; i < 1 << order ; i++)
-    {
-        p = page + i;
-        page_list_del(p, &d->page_list);
-    }
-
-    /* Then add the first one to the appropriate populate-on-demand list */
-    switch(order)
-    {
-    case 9:
-        page_list_add_tail(page, &p2m->pod.super); /* lock: page_alloc */
-        p2m->pod.count += 1 << order;
-        break;
-    case 0:
-        page_list_add_tail(page, &p2m->pod.single); /* lock: page_alloc */
-        p2m->pod.count += 1;
-        break;
-    default:
-        BUG();
-    }
-
-    /* Ensure that the PoD cache has never been emptied.  
-     * This may cause "zombie domains" since the page will never be freed. */
-    BUG_ON( d->arch.relmem != RELMEM_not_started );
-
-    spin_unlock(&d->page_alloc_lock);
-
-    return 0;
-}
-
-/* Get a page of size order from the populate-on-demand cache.  Will break
- * down 2-meg pages into singleton pages automatically.  Returns null if
- * a superpage is requested and no superpages are available.  Must be called
- * with the d->page_lock held. */
-static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
-                                            unsigned long order)
-{
-    struct page_info *p = NULL;
-    int i;
-
-    if ( order == 9 && page_list_empty(&p2m->pod.super) )
-    {
-        return NULL;
-    }
-    else if ( order == 0 && page_list_empty(&p2m->pod.single) )
-    {
-        unsigned long mfn;
-        struct page_info *q;
-
-        BUG_ON( page_list_empty(&p2m->pod.super) );
-
-        /* Break up a superpage to make single pages. NB count doesn't
-         * need to be adjusted. */
-        p = page_list_remove_head(&p2m->pod.super);
-        mfn = mfn_x(page_to_mfn(p));
-
-        for ( i=0; i<SUPERPAGE_PAGES; i++ )
-        {
-            q = mfn_to_page(_mfn(mfn+i));
-            page_list_add_tail(q, &p2m->pod.single);
-        }
-    }
-
-    switch ( order )
-    {
-    case 9:
-        BUG_ON( page_list_empty(&p2m->pod.super) );
-        p = page_list_remove_head(&p2m->pod.super);
-        p2m->pod.count -= 1 << order; /* Lock: page_alloc */
-        break;
-    case 0:
-        BUG_ON( page_list_empty(&p2m->pod.single) );
-        p = page_list_remove_head(&p2m->pod.single);
-        p2m->pod.count -= 1;
-        break;
-    default:
-        BUG();
-    }
-
-    /* Put the pages back on the domain page_list */
-    for ( i = 0 ; i < (1 << order); i++ )
-    {
-        BUG_ON(page_get_owner(p + i) != p2m->domain);
-        page_list_add_tail(p + i, &p2m->domain->page_list);
-    }
-
-    return p;
-}
-
-/* Set the size of the cache, allocating or freeing as necessary. */
-static int
-p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible)
-{
-    struct domain *d = p2m->domain;
-    int ret = 0;
-
-    /* Increasing the target */
-    while ( pod_target > p2m->pod.count )
-    {
-        struct page_info * page;
-        int order;
-
-        if ( (pod_target - p2m->pod.count) >= SUPERPAGE_PAGES )
-            order = 9;
-        else
-            order = 0;
-    retry:
-        page = alloc_domheap_pages(d, order, 0);
-        if ( unlikely(page == NULL) )
-        {
-            if ( order == 9 )
-            {
-                /* If we can't allocate a superpage, try singleton pages */
-                order = 0;
-                goto retry;
-            }   
-            
-            printk("%s: Unable to allocate domheap page for pod cache.  target %lu cachesize %d\n",
-                   __func__, pod_target, p2m->pod.count);
-            ret = -ENOMEM;
-            goto out;
-        }
-
-        p2m_pod_cache_add(p2m, page, order);
-
-        if ( hypercall_preempt_check() && preemptible )
-        {
-            ret = -EAGAIN;
-            goto out;
-        }
-    }
-
-    /* Decreasing the target */
-    /* We hold the p2m lock here, so we don't need to worry about
-     * cache disappearing under our feet. */
-    while ( pod_target < p2m->pod.count )
-    {
-        struct page_info * page;
-        int order, i;
-
-        /* Grab the lock before checking that pod.super is empty, or the last
-         * entries may disappear before we grab the lock. */
-        spin_lock(&d->page_alloc_lock);
-
-        if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
-             && !page_list_empty(&p2m->pod.super) )
-            order = 9;
-        else
-            order = 0;
-
-        page = p2m_pod_cache_get(p2m, order);
-
-        ASSERT(page != NULL);
-
-        spin_unlock(&d->page_alloc_lock);
-
-        /* Then free them */
-        for ( i = 0 ; i < (1 << order) ; i++ )
-        {
-            /* Copied from common/memory.c:guest_remove_page() */
-            if ( unlikely(!get_page(page+i, d)) )
-            {
-                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
-                ret = -EINVAL;
-                goto out;
-            }
-
-            if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
-                put_page_and_type(page+i);
-            
-            if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
-                put_page(page+i);
-
-            put_page(page+i);
-
-            if ( hypercall_preempt_check() && preemptible )
-            {
-                ret = -EAGAIN;
-                goto out;
-            }
-        }
-    }
-
-out:
-    return ret;
-}
-
-/*
- * The "right behavior" here requires some careful thought.  First, some
- * definitions:
- * + M: static_max
- * + B: number of pages the balloon driver has ballooned down to.
- * + P: Number of populated pages. 
- * + T: Old target
- * + T': New target
- *
- * The following equations should hold:
- *  0 <= P <= T <= B <= M
- *  d->arch.p2m->pod.entry_count == B - P
- *  d->tot_pages == P + d->arch.p2m->pod.count
- *
- * Now we have the following potential cases to cover:
- *  B < T' : Set the PoD cache size equal to the number of outstanding PoD
- *   entries.  The balloon driver will deflate the balloon to give back
- *   the remainder of the ram to the guest OS.
- *  T < T' < B : Increase the PoD cache size.
- *  T' < T <= B : Here we have a choice.  We can decrease the size of the cache
- *   and get the memory right away.  However, that means every time we
- *   reduce the memory target we risk the guest attempting to populate the
- *   memory before the balloon driver has reached its new target.  It is safer
- *   never to reduce the cache size here, and only to do so when the balloon
- *   driver frees PoD ranges.
- *
- * If there are many zero pages, we could reach the target also by doing
- * zero sweeps and marking the ranges PoD; but the balloon driver will have
- * to free this memory eventually anyway, so we don't actually gain that much
- * by doing so.
- *
- * NB that the equation (B<T') may require adjustment to the cache
- * size as PoD pages are freed as well; i.e., freeing a PoD-backed
- * entry when pod.entry_count == pod.count requires us to reduce both
- * pod.entry_count and pod.count.
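- *
- * Worked example (illustrative numbers only): with M = 1024, P = 512
- * populated pages and B = 768, entry_count = B - P = 256.  Raising the
- * target to T' = 900 (> B) gives a raw pod_target of T' - P = 388, which
- * is clamped to the 256 outstanding entries; the balloon driver deflates
- * to supply the rest.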
- */
-int
-p2m_pod_set_mem_target(struct domain *d, unsigned long target)
-{
-    unsigned pod_target;
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    int ret = 0;
-    unsigned long populated;
-
-    p2m_lock(p2m);
-
-    /* P == B: Nothing to do. */
-    if ( p2m->pod.entry_count == 0 )
-        goto out;
-
-    /* Don't do anything if the domain is being torn down */
-    if ( d->is_dying )
-        goto out;
-
-    /* T' < B: Don't reduce the cache size; let the balloon driver
-     * take care of it. */
-    if ( target < d->tot_pages )
-        goto out;
-
-    populated  = d->tot_pages - p2m->pod.count;
-
-    pod_target = target - populated;
-
-    /* B < T': Set the cache size equal to # of outstanding entries,
-     * let the balloon driver fill in the rest. */
-    if ( pod_target > p2m->pod.entry_count )
-        pod_target = p2m->pod.entry_count;
-
-    ASSERT( pod_target >= p2m->pod.count );
-
-    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
-
-out:
-    p2m_unlock(p2m);
-
-    return ret;
-}
-
-void
-p2m_pod_empty_cache(struct domain *d)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    struct page_info *page;
-
-    /* After this barrier no new PoD activities can happen. */
-    BUG_ON(!d->is_dying);
-    spin_barrier(&p2m->lock);
-
-    spin_lock(&d->page_alloc_lock);
-
-    while ( (page = page_list_remove_head(&p2m->pod.super)) )
-    {
-        int i;
-            
-        for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ )
-        {
-            BUG_ON(page_get_owner(page + i) != d);
-            page_list_add_tail(page + i, &d->page_list);
-        }
-
-        p2m->pod.count -= SUPERPAGE_PAGES;
-    }
-
-    while ( (page = page_list_remove_head(&p2m->pod.single)) )
-    {
-        BUG_ON(page_get_owner(page) != d);
-        page_list_add_tail(page, &d->page_list);
-
-        p2m->pod.count -= 1;
-    }
-
-    BUG_ON(p2m->pod.count != 0);
-
-    spin_unlock(&d->page_alloc_lock);
-}
-
-int
-p2m_pod_offline_or_broken_hit(struct page_info *p)
-{
-    struct domain *d;
-    struct p2m_domain *p2m;
-    struct page_info *q, *tmp;
-    unsigned long mfn, bmfn;
-
-    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
-        return 0;
-
-    spin_lock(&d->page_alloc_lock);
-    bmfn = mfn_x(page_to_mfn(p));
-    page_list_for_each_safe(q, tmp, &p2m->pod.super)
-    {
-        mfn = mfn_x(page_to_mfn(q));
-        if ( (bmfn >= mfn) && ((bmfn - mfn) < SUPERPAGE_PAGES) )
-        {
-            unsigned long i;
-            page_list_del(q, &p2m->pod.super);
-            for ( i = 0; i < SUPERPAGE_PAGES; i++)
-            {
-                q = mfn_to_page(_mfn(mfn + i));
-                page_list_add_tail(q, &p2m->pod.single);
-            }
-            page_list_del(p, &p2m->pod.single);
-            p2m->pod.count--;
-            goto pod_hit;
-        }
-    }
-
-    page_list_for_each_safe(q, tmp, &p2m->pod.single)
-    {
-        mfn = mfn_x(page_to_mfn(q));
-        if ( mfn == bmfn )
-        {
-            page_list_del(p, &p2m->pod.single);
-            p2m->pod.count--;
-            goto pod_hit;
-        }
-    }
-
-    spin_unlock(&d->page_alloc_lock);
-    return 0;
-
-pod_hit:
-    page_list_add_tail(p, &d->arch.relmem_list);
-    spin_unlock(&d->page_alloc_lock);
-    return 1;
-}
-
-void
-p2m_pod_offline_or_broken_replace(struct page_info *p)
-{
-    struct domain *d;
-    struct p2m_domain *p2m;
-
-    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
-        return;
-
-    free_domheap_page(p);
-
-    p = alloc_domheap_page(d, 0);
-    if ( unlikely(!p) )
-        return;
-
-    p2m_lock(p2m);
-    p2m_pod_cache_add(p2m, p, 0);
-    p2m_unlock(p2m);
-    return;
-}
-
-/* This function is needed for two reasons:
- * + To properly handle clearing of PoD entries
- * + To "steal back" memory being freed for the PoD cache, rather than
- *   releasing it.
- *
- * Once both of these tasks have been completed, we can return and
- * allow decrease_reservation() to handle everything else.
- */
-int
-p2m_pod_decrease_reservation(struct domain *d,
-                             xen_pfn_t gpfn,
-                             unsigned int order)
-{
-    int ret=0;
-    int i;
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
-    int steal_for_cache = 0;
-    int pod = 0, nonpod = 0, ram = 0;
-    
-
-    /* If we don't have any outstanding PoD entries, let things take their
-     * course */
-    if ( p2m->pod.entry_count == 0 )
-        goto out;
-
-    /* Figure out if we need to steal some freed memory for our cache */
-    steal_for_cache =  ( p2m->pod.entry_count > p2m->pod.count );
-
-    p2m_lock(p2m);
-    audit_p2m(p2m, 1);
-
-    if ( unlikely(d->is_dying) )
-        goto out_unlock;
-
-    /* See what's in here. */
-    /* FIXME: Add contiguous; query for PSE entries? */
-    for ( i=0; i<(1<<order); i++)
-    {
-        p2m_type_t t;
-
-        gfn_to_mfn_query(p2m, gpfn + i, &t);
-
-        if ( t == p2m_populate_on_demand )
-            pod++;
-        else
-        {
-            nonpod++;
-            if ( p2m_is_ram(t) )
-                ram++;
-        }
-    }
-
-    /* No populate-on-demand?  Don't need to steal anything?  Then we're done! */
-    if(!pod && !steal_for_cache)
-        goto out_unlock;
-
-    if ( !nonpod )
-    {
-        /* All PoD: Mark the whole region invalid and tell caller
-         * we're done. */
-        set_p2m_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid, p2m->default_access);
-        p2m->pod.entry_count-=(1<<order); /* Lock: p2m */
-        BUG_ON(p2m->pod.entry_count < 0);
-        ret = 1;
-        goto out_entry_check;
-    }
-
-    /* FIXME: Steal contig 2-meg regions for cache */
-
-    /* Process as long as:
-     * + There are PoD entries to handle, or
-     * + There is ram left, and we want to steal it
-     */
-    for ( i=0;
-          i<(1<<order) && (pod>0 || (steal_for_cache && ram > 0));
-          i++)
-    {
-        mfn_t mfn;
-        p2m_type_t t;
-
-        mfn = gfn_to_mfn_query(p2m, gpfn + i, &t);
-        if ( t == p2m_populate_on_demand )
-        {
-            set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
-            p2m->pod.entry_count--; /* Lock: p2m */
-            BUG_ON(p2m->pod.entry_count < 0);
-            pod--;
-        }
-        else if ( steal_for_cache && p2m_is_ram(t) )
-        {
-            struct page_info *page;
-
-            ASSERT(mfn_valid(mfn));
-
-            page = mfn_to_page(mfn);
-
-            set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
-            set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY);
-
-            p2m_pod_cache_add(p2m, page, 0);
-
-            steal_for_cache =  ( p2m->pod.entry_count > p2m->pod.count );
-
-            nonpod--;
-            ram--;
-        }
-    }    
-
-    /* If there are no more non-PoD entries, tell decrease_reservation() that
-     * there's nothing left to do. */
-    if ( nonpod == 0 )
-        ret = 1;
-
-out_entry_check:
-    /* If we've reduced our "liabilities" beyond our "assets", free some */
-    if ( p2m->pod.entry_count < p2m->pod.count )
-    {
-        p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/);
-    }
-
-out_unlock:
-    audit_p2m(p2m, 1);
-    p2m_unlock(p2m);
-
-out:
-    return ret;
-}
-
-void
-p2m_pod_dump_data(struct p2m_domain *p2m)
-{
-    printk("    PoD entries=%d cachesize=%d\n",
-           p2m->pod.entry_count, p2m->pod.count);
-}
-
-
-/* Search for all-zero superpages to be reclaimed as superpages for the
- * PoD cache. Must be called w/ p2m lock held, page_alloc lock not held. */
-static int
-p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
-{
-    mfn_t mfn, mfn0 = _mfn(INVALID_MFN);
-    p2m_type_t type, type0 = 0;
-    unsigned long * map = NULL;
-    int ret=0, reset = 0;
-    int i, j;
-    int max_ref = 1;
-    struct domain *d = p2m->domain;
-
-    if ( !superpage_aligned(gfn) )
-        goto out;
-
-    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
-    if ( paging_mode_shadow(d) )
-        max_ref++;
-
-    /* Look up the mfns, checking to make sure they're the same mfn
-     * and aligned, and mapping them. */
-    for ( i=0; i<SUPERPAGE_PAGES; i++ )
-    {
-        
-        mfn = gfn_to_mfn_query(p2m, gfn + i, &type);
-
-        if ( i == 0 )
-        {
-            mfn0 = mfn;
-            type0 = type;
-        }
-
-        /* Conditions that must be met to reclaim this region as a superpage:
-         * + All gfns are ram types
-         * + All gfns have the same type
-         * + All of the mfns are allocated to a domain
-         * + None of the mfns are used as pagetables, or allocated via xenheap
-         * + The first mfn is 2-meg aligned
-         * + All the other mfns are in sequence
-         * Adding for good measure:
-         * + None of the mfns are likely to be mapped elsewhere (refcount
-         *   2 or less for shadow, 1 for hap)
-         */
-        if ( !p2m_is_ram(type)
-             || type != type0
-             || ( (mfn_to_page(mfn)->count_info & PGC_allocated) == 0 )
-             || ( (mfn_to_page(mfn)->count_info & (PGC_page_table|PGC_xen_heap)) != 0 )
-             || ( (mfn_to_page(mfn)->count_info & PGC_xen_heap  ) != 0 )
-             || ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > max_ref )
-             || !( ( i == 0 && superpage_aligned(mfn_x(mfn0)) )
-                   || ( i != 0 && mfn_x(mfn) == (mfn_x(mfn0) + i) ) ) )
-            goto out;
-    }
-
-    /* Now, do a quick check to see if it may be zero before unmapping. */
-    for ( i=0; i<SUPERPAGE_PAGES; i++ )
-    {
-        /* Quick zero-check */
-        map = map_domain_page(mfn_x(mfn0) + i);
-
-        for ( j=0; j<16; j++ )
-            if( *(map+j) != 0 )
-                break;
-
-        unmap_domain_page(map);
-
-        if ( j < 16 )
-            goto out;
-
-    }
-
-    /* Try to remove the page, restoring old mapping if it fails. */
-    set_p2m_entry(p2m, gfn,
-                  _mfn(POPULATE_ON_DEMAND_MFN), 9,
-                  p2m_populate_on_demand, p2m->default_access);
-
-    /* Make sure none of the MFNs are used elsewhere... for example, mapped
-     * via the grant table interface, or by qemu.  Allow one refcount for
-     * being allocated to the domain. */
-    for ( i=0; i < SUPERPAGE_PAGES; i++ )
-    {
-        mfn = _mfn(mfn_x(mfn0) + i);
-        if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 )
-        {
-            reset = 1;
-            goto out_reset;
-        }
-    }
-
-    /* Finally, do a full zero-check */
-    for ( i=0; i < SUPERPAGE_PAGES; i++ )
-    {
-        map = map_domain_page(mfn_x(mfn0) + i);
-
-        for ( j=0; j<PAGE_SIZE/sizeof(*map); j++ )
-            if( *(map+j) != 0 )
-            {
-                reset = 1;
-                break;
-            }
-
-        unmap_domain_page(map);
-
-        if ( reset )
-            goto out_reset;
-    }
-
-    if ( tb_init_done )
-    {
-        struct {
-            u64 gfn, mfn;
-            int d:16,order:16;
-        } t;
-
-        t.gfn = gfn;
-        t.mfn = mfn_x(mfn);
-        t.d = d->domain_id;
-        t.order = 9;
-
-        __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
-    }
-
-    /* Finally!  We've passed all the checks, and can add the mfn superpage
-     * back on the PoD cache, and account for the new p2m PoD entries */
-    p2m_pod_cache_add(p2m, mfn_to_page(mfn0), 9);
-    p2m->pod.entry_count += SUPERPAGE_PAGES;
-
-out_reset:
-    if ( reset )
-        set_p2m_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access);
-    
-out:
-    return ret;
-}
-
-static void
-p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count)
-{
-    mfn_t mfns[count];
-    p2m_type_t types[count];
-    unsigned long * map[count];
-    struct domain *d = p2m->domain;
-
-    int i, j;
-    int max_ref = 1;
-
-    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
-    if ( paging_mode_shadow(d) )
-        max_ref++;
-
-    /* First, get the gfn list, translate to mfns, and map the pages. */
-    for ( i=0; i<count; i++ )
-    {
-        mfns[i] = gfn_to_mfn_query(p2m, gfns[i], types + i);
-        /* If this is ram, and not a pagetable or from the xen heap, and probably not mapped
-           elsewhere, map it; otherwise, skip. */
-        if ( p2m_is_ram(types[i])
-             && ( (mfn_to_page(mfns[i])->count_info & PGC_allocated) != 0 ) 
-             && ( (mfn_to_page(mfns[i])->count_info & (PGC_page_table|PGC_xen_heap)) == 0 )
-             && ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) <= max_ref ) )
-            map[i] = map_domain_page(mfn_x(mfns[i]));
-        else
-            map[i] = NULL;
-    }
-
-    /* Then, go through and check for zeroed pages, removing write permission
-     * for those with zeroes. */
-    for ( i=0; i<count; i++ )
-    {
-        if(!map[i])
-            continue;
-
-        /* Quick zero-check */
-        for ( j=0; j<16; j++ )
-            if( *(map[i]+j) != 0 )
-                break;
-
-        if ( j < 16 )
-        {
-            unmap_domain_page(map[i]);
-            map[i] = NULL;
-            continue;
-        }
-
-        /* Try to remove the page, restoring old mapping if it fails. */
-        set_p2m_entry(p2m, gfns[i],
-                      _mfn(POPULATE_ON_DEMAND_MFN), 0,
-                      p2m_populate_on_demand, p2m->default_access);
-
-        /* See if the page was successfully unmapped.  (Allow one refcount
-         * for being allocated to a domain.) */
-        if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 )
-        {
-            unmap_domain_page(map[i]);
-            map[i] = NULL;
-
-            set_p2m_entry(p2m, gfns[i], mfns[i], 0, types[i], p2m->default_access);
-
-            continue;
-        }
-    }
-
-    /* Now check each page for real */
-    for ( i=0; i < count; i++ )
-    {
-        if(!map[i])
-            continue;
-
-        for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
-            if( *(map[i]+j) != 0 )
-                break;
-
-        unmap_domain_page(map[i]);
-
-        /* See comment in p2m_pod_zero_check_superpage() re gnttab
-         * check timing.  */
-        if ( j < PAGE_SIZE/sizeof(*map[i]) )
-        {
-            set_p2m_entry(p2m, gfns[i], mfns[i], 0, types[i], p2m->default_access);
-        }
-        else
-        {
-            if ( tb_init_done )
-            {
-                struct {
-                    u64 gfn, mfn;
-                    int d:16,order:16;
-                } t;
-
-                t.gfn = gfns[i];
-                t.mfn = mfn_x(mfns[i]);
-                t.d = d->domain_id;
-                t.order = 0;
-        
-                __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
-            }
-
-            /* Add to cache, and account for the new p2m PoD entry */
-            p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), 0);
-            p2m->pod.entry_count++;
-        }
-    }
-    
-}
-
-#define POD_SWEEP_LIMIT 1024
-static void
-p2m_pod_emergency_sweep_super(struct p2m_domain *p2m)
-{
-    unsigned long i, start, limit;
-
-    if ( p2m->pod.reclaim_super == 0 )
-    {
-        p2m->pod.reclaim_super = (p2m->pod.max_guest>>9)<<9;
-        p2m->pod.reclaim_super -= SUPERPAGE_PAGES;
-    }
-    
-    start = p2m->pod.reclaim_super;
-    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
-
-    for ( i=p2m->pod.reclaim_super ; i > 0 ; i -= SUPERPAGE_PAGES )
-    {
-        p2m_pod_zero_check_superpage(p2m, i);
-        /* Stop if we're past our limit and we have found *something*.
-         *
-         * NB that this is a zero-sum game; we're increasing our cache size
-         * by increasing our 'debt'.  Since we hold the p2m lock,
-         * (entry_count - count) must remain the same. */
-        if ( !page_list_empty(&p2m->pod.super) &&  i < limit )
-            break;
-    }
-
-    p2m->pod.reclaim_super = i ? i - SUPERPAGE_PAGES : 0;
-}
-
-#define POD_SWEEP_STRIDE  16
-static void
-p2m_pod_emergency_sweep(struct p2m_domain *p2m)
-{
-    unsigned long gfns[POD_SWEEP_STRIDE];
-    unsigned long i, j=0, start, limit;
-    p2m_type_t t;
-
-
-    if ( p2m->pod.reclaim_single == 0 )
-        p2m->pod.reclaim_single = p2m->pod.max_guest;
-
-    start = p2m->pod.reclaim_single;
-    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
-
-    /* FIXME: Figure out how to avoid superpages */
-    for ( i=p2m->pod.reclaim_single; i > 0 ; i-- )
-    {
-        gfn_to_mfn_query(p2m, i, &t );
-        if ( p2m_is_ram(t) )
-        {
-            gfns[j] = i;
-            j++;
-            BUG_ON(j > POD_SWEEP_STRIDE);
-            if ( j == POD_SWEEP_STRIDE )
-            {
-                p2m_pod_zero_check(p2m, gfns, j);
-                j = 0;
-            }
-        }
-        /* Stop if we're past our limit and we have found *something*.
-         *
-         * NB that this is a zero-sum game; we're increasing our cache size
-         * by re-increasing our 'debt'.  Since we hold the p2m lock,
-         * (entry_count - count) must remain the same. */
-        if ( p2m->pod.count > 0 && i < limit )
-            break;
-    }
-
-    if ( j )
-        p2m_pod_zero_check(p2m, gfns, j);
-
-    p2m->pod.reclaim_single = i ? i - 1 : i;
-
-}
-
-int
-p2m_pod_demand_populate(struct p2m_domain *p2m, unsigned long gfn,
-                        unsigned int order,
-                        p2m_query_t q)
-{
-    struct domain *d = p2m->domain;
-    struct page_info *p = NULL; /* Compiler warnings */
-    unsigned long gfn_aligned;
-    mfn_t mfn;
-    int i;
-
-    ASSERT(p2m_locked_by_me(p2m));
-
-    /* This check is done with the p2m lock held.  This will make sure that
-     * even if d->is_dying changes under our feet, p2m_pod_empty_cache() 
-     * won't start until we're done. */
-    if ( unlikely(d->is_dying) )
-        goto out_fail;
-
-    /* Because PoD does not have cache list for 1GB pages, it has to remap
-     * 1GB region to 2MB chunks for a retry. */
-    if ( order == 18 )
-    {
-        gfn_aligned = (gfn >> order) << order;
-        /* Note that we are supposed to call set_p2m_entry() 512 times to 
-         * split 1GB into 512 2MB pages here. But we only do it once because
-         * set_p2m_entry() should automatically shatter the 1GB page into
-         * 512 2MB pages; the other 511 calls are unnecessary.
-         */
-        set_p2m_entry(p2m, gfn_aligned, _mfn(POPULATE_ON_DEMAND_MFN), 9,
-                      p2m_populate_on_demand, p2m->default_access);
-        audit_p2m(p2m, 1);
-        p2m_unlock(p2m);
-        return 0;
-    }
-
-    /* Once we've ballooned down enough that we can fill the remaining
-     * PoD entries from the cache, don't sweep even if the particular
-     * list we want to use is empty: that can lead to thrashing zero pages 
-     * through the cache for no good reason.  */
-    if ( p2m->pod.entry_count > p2m->pod.count )
-    {
-
-        /* If we're low, start a sweep */
-        if ( order == 9 && page_list_empty(&p2m->pod.super) )
-            p2m_pod_emergency_sweep_super(p2m);
-
-        if ( page_list_empty(&p2m->pod.single) &&
-             ( ( order == 0 )
-               || (order == 9 && page_list_empty(&p2m->pod.super) ) ) )
-            p2m_pod_emergency_sweep(p2m);
-    }
-
-    /* Keep track of the highest gfn demand-populated by a guest fault */
-    if ( q == p2m_guest && gfn > p2m->pod.max_guest )
-        p2m->pod.max_guest = gfn;
-
-    spin_lock(&d->page_alloc_lock);
-
-    if ( p2m->pod.count == 0 )
-        goto out_of_memory;
-
-    /* Get a page from the cache.  A NULL return value indicates that the
-     * 2-meg range should be marked singleton PoD, and retried */
-    if ( (p = p2m_pod_cache_get(p2m, order)) == NULL )
-        goto remap_and_retry;
-
-    mfn = page_to_mfn(p);
-
-    BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
-
-    spin_unlock(&d->page_alloc_lock);
-
-    gfn_aligned = (gfn >> order) << order;
-
-    set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, p2m->default_access);
-
-    for( i = 0; i < (1UL << order); i++ )
-    {
-        set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i);
-        paging_mark_dirty(d, mfn_x(mfn) + i);
-    }
-    
-    p2m->pod.entry_count -= (1 << order); /* Lock: p2m */
-    BUG_ON(p2m->pod.entry_count < 0);
-
-    if ( tb_init_done )
-    {
-        struct {
-            u64 gfn, mfn;
-            int d:16,order:16;
-        } t;
-
-        t.gfn = gfn;
-        t.mfn = mfn_x(mfn);
-        t.d = d->domain_id;
-        t.order = order;
-        
-        __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
-    }
-
-    return 0;
-out_of_memory:
-    spin_unlock(&d->page_alloc_lock);
-
-    printk("%s: Out of populate-on-demand memory! tot_pages %" PRIu32 " 
pod_entries %" PRIi32 "\n",
-           __func__, d->tot_pages, p2m->pod.entry_count);
-    domain_crash(d);
-out_fail:
-    return -1;
-remap_and_retry:
-    BUG_ON(order != 9);
-    spin_unlock(&d->page_alloc_lock);
-
-    /* Remap this 2-meg region in singleton chunks */
-    gfn_aligned = (gfn>>order)<<order;
-    for(i=0; i<(1<<order); i++)
-        set_p2m_entry(p2m, gfn_aligned+i, _mfn(POPULATE_ON_DEMAND_MFN), 0,
-                      p2m_populate_on_demand, p2m->default_access);
-    if ( tb_init_done )
-    {
-        struct {
-            u64 gfn;
-            int d:16;
-        } t;
-
-        t.gfn = gfn;
-        t.d = d->domain_id;
-        
-        __trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), &t);
-    }
-
-    return 0;
-}
-
-/* Non-ept "lock-and-check" wrapper */
-static int p2m_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn,
-                                      l1_pgentry_t *p2m_entry, int order,
-                                      p2m_query_t q)
-{
-    /* Only take the lock if we don't already have it.  Otherwise it
-     * wouldn't be safe to do p2m lookups with the p2m lock held */
-    int do_locking = !p2m_locked_by_me(p2m);
-    int r;
-
-    if ( do_locking )
-        p2m_lock(p2m);
-
-    audit_p2m(p2m, 1);
-
-    /* Check to make sure this is still PoD */
-    if ( p2m_flags_to_type(l1e_get_flags(*p2m_entry)) != p2m_populate_on_demand )
-    {
-        if ( do_locking )
-            p2m_unlock(p2m);
-        return 0;
-    }
-
-    r = p2m_pod_demand_populate(p2m, gfn, order, q);
-
-    audit_p2m(p2m, 1);
-    if ( do_locking )
-        p2m_unlock(p2m);
-
-    return r;
-}
-
-// Returns 0 on error (out of memory)
-static int
-p2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
-              unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma)
-{
-    // XXX -- this might be able to be faster iff current->domain == d
-    mfn_t table_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
-    void *table =map_domain_page(mfn_x(table_mfn));
-    unsigned long i, gfn_remainder = gfn;
-    l1_pgentry_t *p2m_entry;
-    l1_pgentry_t entry_content;
-    l2_pgentry_t l2e_content;
-    l3_pgentry_t l3e_content;
-    int rv=0;
-    unsigned int iommu_pte_flags = (p2mt == p2m_ram_rw) ?
-                                   IOMMUF_readable|IOMMUF_writable:
-                                   0; 
-    unsigned long old_mfn = 0;
-
-    if ( tb_init_done )
-    {
-        struct {
-            u64 gfn, mfn;
-            int p2mt;
-            int d:16,order:16;
-        } t;
-
-        t.gfn = gfn;
-        t.mfn = mfn_x(mfn);
-        t.p2mt = p2mt;
-        t.d = p2m->domain->domain_id;
-        t.order = page_order;
-
-        __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t);
-    }
-
-#if CONFIG_PAGING_LEVELS >= 4
-    if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
-                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
-                         L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
-        goto out;
-#endif
-    /*
-     * Try to allocate 1GB page table if this feature is supported.
-     */
-    if ( page_order == 18 )
-    {
-        l1_pgentry_t old_entry = l1e_empty();
-        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
-                                   L3_PAGETABLE_SHIFT - PAGE_SHIFT,
-                                   L3_PAGETABLE_ENTRIES);
-        ASSERT(p2m_entry);
-        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
-             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
-        {
-            /* We're replacing a non-SP page with a superpage.  Make sure to
-             * handle freeing the table properly. */
-            old_entry = *p2m_entry;
-        }
-
-        ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
-        l3e_content = mfn_valid(mfn) 
-            ? l3e_from_pfn(mfn_x(mfn),
-                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
-            : l3e_empty();
-        entry_content.l1 = l3e_content.l3;
-
-        if ( entry_content.l1 != 0 )
-        {
-            p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
-            old_mfn = l1e_get_pfn(*p2m_entry);
-        }
-
-        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3);
-        /* NB: paging_write_p2m_entry() handles tlb flushes properly */
-
-        /* Free old intermediate tables if necessary */
-        if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
-            p2m_free_entry(p2m, &old_entry, page_order);
-    }
-    /*
-     * When using PAE Xen, we only allow 33 bits of pseudo-physical
-     * address in translated guests (i.e. 8 GBytes).  This restriction
-     * comes from wanting to map the P2M table into the 16MB RO_MPT hole
-     * in Xen's address space for translated PV guests.
-     * When using AMD's NPT on PAE Xen, we are restricted to 4GB.
-     */
-    else if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
-                              L3_PAGETABLE_SHIFT - PAGE_SHIFT,
-                              ((CONFIG_PAGING_LEVELS == 3)
-                               ? (hap_enabled(p2m->domain) ? 4 : 8)
-                               : L3_PAGETABLE_ENTRIES),
-                              PGT_l2_page_table) )
-        goto out;
-
-    if ( page_order == 0 )
-    {
-        if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
-                             L2_PAGETABLE_SHIFT - PAGE_SHIFT,
-                             L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
-            goto out;
-
-        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
-                                   0, L1_PAGETABLE_ENTRIES);
-        ASSERT(p2m_entry);
-        
-        if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
-            entry_content = l1e_from_pfn(mfn_x(mfn),
-                                         p2m_type_to_flags(p2mt, mfn));
-        else
-            entry_content = l1e_empty();
-
-        if ( entry_content.l1 != 0 )
-        {
-            p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
-            old_mfn = l1e_get_pfn(*p2m_entry);
-        }
-        /* level 1 entry */
-        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
-        /* NB: paging_write_p2m_entry() handles tlb flushes properly */
-    }
-    else if ( page_order == 9 )
-    {
-        l1_pgentry_t old_entry = l1e_empty();
-        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
-                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
-                                   L2_PAGETABLE_ENTRIES);
-        ASSERT(p2m_entry);
-        
-        /* FIXME: Deal with 4k replaced by 2meg pages */
-        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
-             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
-        {
-            /* We're replacing a non-SP page with a superpage.  Make sure to
-             * handle freeing the table properly. */
-            old_entry = *p2m_entry;
-        }
-        
-        ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
-        if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
-            l2e_content = l2e_from_pfn(mfn_x(mfn),
-                                       p2m_type_to_flags(p2mt, mfn) |
-                                       _PAGE_PSE);
-        else
-            l2e_content = l2e_empty();
-        
-        entry_content.l1 = l2e_content.l2;
-
-        if ( entry_content.l1 != 0 )
-        {
-            p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
-            old_mfn = l1e_get_pfn(*p2m_entry);
-        }
-
-        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2);
-        /* NB: paging_write_p2m_entry() handles tlb flushes properly */
-
-        /* Free old intermediate tables if necessary */
-        if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
-            p2m_free_entry(p2m, &old_entry, page_order);
-    }
-
-    /* Track the highest gfn for which we have ever had a valid mapping */
-    if ( mfn_valid(mfn) 
-         && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
-        p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
-
-    if ( iommu_enabled && need_iommu(p2m->domain) )
-    {
-        if ( iommu_hap_pt_share )
-        {
-            if ( old_mfn && (old_mfn != mfn_x(mfn)) )
-                amd_iommu_flush_pages(p2m->domain, gfn, page_order);
-        }
-        else
-        {
-            if ( p2mt == p2m_ram_rw )
-                for ( i = 0; i < (1UL << page_order); i++ )
-                    iommu_map_page(p2m->domain, gfn+i, mfn_x(mfn)+i,
-                                   IOMMUF_readable|IOMMUF_writable);
-            else
-                for ( int i = 0; i < (1UL << page_order); i++ )
-                    iommu_unmap_page(p2m->domain, gfn+i);
-        }
-    }
-
-    /* Success */
-    rv = 1;
-
-out:
-    unmap_domain_page(table);
-    return rv;
-}
-
-static mfn_t
-p2m_gfn_to_mfn(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
-               p2m_query_t q)
-{
-    mfn_t mfn;
-    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
-    l2_pgentry_t *l2e;
-    l1_pgentry_t *l1e;
-
-    ASSERT(paging_mode_translate(p2m->domain));
-
-    /* XXX This is for compatibility with the old model, where anything not 
-     * XXX marked as RAM was considered to be emulated MMIO space.
-     * XXX Once we start explicitly registering MMIO regions in the p2m 
-     * XXX we will return p2m_invalid for unmapped gfns */
-    *t = p2m_mmio_dm;
-    /* Not implemented except with EPT */
-    *a = p2m_access_rwx; 
-
-    mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
-
-    if ( gfn > p2m->max_mapped_pfn )
-        /* This pfn is higher than the highest the p2m map currently holds */
-        return _mfn(INVALID_MFN);
-
-#if CONFIG_PAGING_LEVELS >= 4
-    {
-        l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
-        l4e += l4_table_offset(addr);
-        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
-        {
-            unmap_domain_page(l4e);
-            return _mfn(INVALID_MFN);
-        }
-        mfn = _mfn(l4e_get_pfn(*l4e));
-        unmap_domain_page(l4e);
-    }
-#endif
-    {
-        l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
-#if CONFIG_PAGING_LEVELS == 3
-        /* On PAE hosts the p2m has eight l3 entries, not four (see
-         * shadow_set_p2m_entry()) so we can't use l3_table_offset.
-         * Instead, just count the number of l3es from zero.  It's safe
-         * to do this because we already checked that the gfn is within
-         * the bounds of the p2m. */
-        l3e += (addr >> L3_PAGETABLE_SHIFT);
-#else
-        l3e += l3_table_offset(addr);
-#endif
-pod_retry_l3:
-        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
-        {
-            if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand )
-            {
-                if ( q != p2m_query )
-                {
-                    if ( !p2m_pod_demand_populate(p2m, gfn, 18, q) )
-                        goto pod_retry_l3;
-                }
-                else
-                    *t = p2m_populate_on_demand;
-            }
-            unmap_domain_page(l3e);
-            return _mfn(INVALID_MFN);
-        }
-        else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
-        {
-            mfn = _mfn(l3e_get_pfn(*l3e) +
-                       l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
-                       l1_table_offset(addr));
-            *t = p2m_flags_to_type(l3e_get_flags(*l3e));
-            unmap_domain_page(l3e);
-
-            ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
-            return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
-        }
-
-        mfn = _mfn(l3e_get_pfn(*l3e));
-        unmap_domain_page(l3e);
-    }
-
-    l2e = map_domain_page(mfn_x(mfn));
-    l2e += l2_table_offset(addr);
-
-pod_retry_l2:
-    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
-    {
-        /* PoD: Try to populate a 2-meg chunk */
-        if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
-        {
-            if ( q != p2m_query ) {
-                if ( !p2m_pod_check_and_populate(p2m, gfn,
-                                                 (l1_pgentry_t *)l2e, 9, q) )
-                    goto pod_retry_l2;
-            } else
-                *t = p2m_populate_on_demand;
-        }
-    
-        unmap_domain_page(l2e);
-        return _mfn(INVALID_MFN);
-    }
-    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
-    {
-        mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
-        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
-        unmap_domain_page(l2e);
-        
-        ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
-        return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
-    }
-
-    mfn = _mfn(l2e_get_pfn(*l2e));
-    unmap_domain_page(l2e);
-
-    l1e = map_domain_page(mfn_x(mfn));
-    l1e += l1_table_offset(addr);
-pod_retry_l1:
-    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
-    {
-        /* PoD: Try to populate */
-        if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand )
-        {
-            if ( q != p2m_query ) {
-                if ( !p2m_pod_check_and_populate(p2m, gfn,
-                                                 (l1_pgentry_t *)l1e, 0, q) )
-                    goto pod_retry_l1;
-            } else
-                *t = p2m_populate_on_demand;
-        }
-    
-        unmap_domain_page(l1e);
-        return _mfn(INVALID_MFN);
-    }
-    mfn = _mfn(l1e_get_pfn(*l1e));
-    *t = p2m_flags_to_type(l1e_get_flags(*l1e));
-    unmap_domain_page(l1e);
-
-    ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
-    return (p2m_is_valid(*t) || p2m_is_grant(*t)) ? mfn : _mfn(INVALID_MFN);
-}
-
-/* Read the current domain's p2m table (through the linear mapping). */
-static mfn_t p2m_gfn_to_mfn_current(struct p2m_domain *p2m,
-                                    unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
-                                    p2m_query_t q)
-{
-    mfn_t mfn = _mfn(INVALID_MFN);
-    p2m_type_t p2mt = p2m_mmio_dm;
-    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
-    /* XXX This is for compatibility with the old model, where anything not 
-     * XXX marked as RAM was considered to be emulated MMIO space.
-     * XXX Once we start explicitly registering MMIO regions in the p2m 
-     * XXX we will return p2m_invalid for unmapped gfns */
-
-    /* Not currently implemented except for EPT */
-    *a = p2m_access_rwx;
-
-    if ( gfn <= p2m->max_mapped_pfn )
-    {
-        l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
-        l2_pgentry_t l2e = l2e_empty();
-        int ret;
-#if CONFIG_PAGING_LEVELS >= 4
-        l3_pgentry_t l3e = l3e_empty();
-#endif
-
-        ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
-               / sizeof(l1_pgentry_t));
-
-#if CONFIG_PAGING_LEVELS >= 4
-        /*
-         * Read & process L3
-         */
-        p2m_entry = (l1_pgentry_t *)
-            &__linear_l2_table[l2_linear_offset(RO_MPT_VIRT_START)
-                               + l3_linear_offset(addr)];
-    pod_retry_l3:
-        ret = __copy_from_user(&l3e, p2m_entry, sizeof(l3e));
-
-        if ( ret != 0 || !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        {
-            if ( (l3e_get_flags(l3e) & _PAGE_PSE) &&
-                 (p2m_flags_to_type(l3e_get_flags(l3e)) == p2m_populate_on_demand) )
-            {
-                /* The read has succeeded, so we know that mapping exists */
-                if ( q != p2m_query )
-                {
-                    if ( !p2m_pod_demand_populate(p2m, gfn, 18, q) )
-                        goto pod_retry_l3;
-                    p2mt = p2m_invalid;
-                    printk("%s: Allocate 1GB failed!\n", __func__);
-                    goto out;
-                }
-                else
-                {
-                    p2mt = p2m_populate_on_demand;
-                    goto out;
-                }
-            }
-            goto pod_retry_l2;
-        }
-
-        if ( l3e_get_flags(l3e) & _PAGE_PSE )
-        {
-            p2mt = p2m_flags_to_type(l3e_get_flags(l3e));
-            ASSERT(l3e_get_pfn(l3e) != INVALID_MFN || !p2m_is_ram(p2mt));
-            if (p2m_is_valid(p2mt) )
-                mfn = _mfn(l3e_get_pfn(l3e) + 
-                           l2_table_offset(addr) * L1_PAGETABLE_ENTRIES + 
-                           l1_table_offset(addr));
-            else
-                p2mt = p2m_mmio_dm;
-            
-            goto out;
-        }
-#endif
-        /*
-         * Read & process L2
-         */
-        p2m_entry = &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START)
-                                       + l2_linear_offset(addr)];
-
-    pod_retry_l2:
-        ret = __copy_from_user(&l2e,
-                               p2m_entry,
-                               sizeof(l2e));
-        if ( ret != 0
-             || !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        {
-            if( (l2e_get_flags(l2e) & _PAGE_PSE)
-                && ( p2m_flags_to_type(l2e_get_flags(l2e))
-                     == p2m_populate_on_demand ) )
-            {
-                /* The read has succeeded, so we know that the mapping
-                 * exists at this point.  */
-                if ( q != p2m_query )
-                {
-                    if ( !p2m_pod_check_and_populate(p2m, gfn,
-                                                     p2m_entry, 9, q) )
-                        goto pod_retry_l2;
-
-                    /* Allocate failed. */
-                    p2mt = p2m_invalid;
-                    printk("%s: Allocate failed!\n", __func__);
-                    goto out;
-                }
-                else
-                {
-                    p2mt = p2m_populate_on_demand;
-                    goto out;
-                }
-            }
-
-            goto pod_retry_l1;
-        }
-        
-        if (l2e_get_flags(l2e) & _PAGE_PSE)
-        {
-            p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
-            ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
-
-            if ( p2m_is_valid(p2mt) )
-                mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
-            else
-                p2mt = p2m_mmio_dm;
-
-            goto out;
-        }
-
-        /*
-         * Read and process L1
-         */
-
-        /* Need to __copy_from_user because the p2m is sparse and this
-         * part might not exist */
-    pod_retry_l1:
-        p2m_entry = &phys_to_machine_mapping[gfn];
-
-        ret = __copy_from_user(&l1e,
-                               p2m_entry,
-                               sizeof(l1e));
-            
-        if ( ret == 0 ) {
-            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
-            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
-
-            if ( p2m_flags_to_type(l1e_get_flags(l1e))
-                 == p2m_populate_on_demand )
-            {
-                /* The read has succeeded, so we know that the mapping
-                 * exists at this point.  */
-                if ( q != p2m_query )
-                {
-                    if ( !p2m_pod_check_and_populate(p2m, gfn,
-                                                     (l1_pgentry_t *)p2m_entry, 0, q) )
-                        goto pod_retry_l1;
-
-                    /* Allocate failed. */
-                    p2mt = p2m_invalid;
-                    goto out;
-                }
-                else
-                {
-                    p2mt = p2m_populate_on_demand;
-                    goto out;
-                }
-            }
-
-            if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) )
-                mfn = _mfn(l1e_get_pfn(l1e));
-            else 
-                /* XXX see above */
-                p2mt = p2m_mmio_dm;
-        }
-    }
-out:
-    *t = p2mt;
-    return mfn;
-}
+/* XXX declare functions moved to p2m-pt.c */
+extern void p2m_pt_init(struct p2m_domain *p2m);
 
 /* Init the datastructures for later use by the p2m code */
 static void p2m_initialise(struct domain *d, struct p2m_domain *p2m)
@@ -1930,15 +87,12 @@ static void p2m_initialise(struct domain
     p2m->default_access = p2m_access_rwx;
 
     p2m->cr3 = CR3_EADDR;
-    p2m->set_entry = p2m_set_entry;
-    p2m->get_entry = p2m_gfn_to_mfn;
-    p2m->get_entry_current = p2m_gfn_to_mfn_current;
-    p2m->change_entry_type_global = p2m_change_type_global;
-    p2m->write_p2m_entry = paging_write_p2m_entry;
     cpus_clear(p2m->p2m_dirty_cpumask);
 
     if ( hap_enabled(d) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
         ept_p2m_init(d);
+    else
+        p2m_pt_init(p2m);
 
     return;
 }
@@ -1986,7 +140,6 @@ void p2m_change_entry_type_global(struct
     p2m_unlock(p2m);
 }
 
-static
 int set_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
                   unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma)
 {
@@ -2162,275 +315,6 @@ void p2m_final_teardown(struct domain *d
     p2m_teardown_nestedp2m(d);
 }
 
-#if P2M_AUDIT
-/* strict_m2p == 0 allows m2p mappings that don't match the p2m.
- * It's intended for add_to_physmap, when the domain has just been allocated 
- * new mfns that might have stale m2p entries from previous owners */
-static void audit_p2m(struct p2m_domain *p2m, int strict_m2p)
-{
-    struct page_info *page;
-    struct domain *od;
-    unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
-    int entry_count = 0;
-    mfn_t p2mfn;
-    unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
-    int test_linear;
-    p2m_type_t type;
-    struct domain *d = p2m->domain;
-
-    if ( !paging_mode_translate(d) )
-        return;
-
-    //P2M_PRINTK("p2m audit starts\n");
-
-    test_linear = ( (d == current->domain)
-                    && !pagetable_is_null(current->arch.monitor_table) );
-    if ( test_linear )
-        flush_tlb_local();
-
-    spin_lock(&d->page_alloc_lock);
-
-    /* Audit part one: walk the domain's page allocation list, checking
-     * the m2p entries. */
-    page_list_for_each ( page, &d->page_list )
-    {
-        mfn = mfn_x(page_to_mfn(page));
-
-        // P2M_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
-
-        od = page_get_owner(page);
-
-        if ( od != d )
-        {
-            P2M_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
-                       mfn, od, (od?od->domain_id:-1), d, d->domain_id);
-            continue;
-        }
-
-        gfn = get_gpfn_from_mfn(mfn);
-        if ( gfn == INVALID_M2P_ENTRY )
-        {
-            orphans_i++;
-            //P2M_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
-            //               mfn);
-            continue;
-        }
-
-        if ( gfn == 0x55555555 || gfn == 0x5555555555555555 )
-        {
-            orphans_d++;
-            //P2M_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
-            //               mfn);
-            continue;
-        }
-
-        if ( gfn == SHARED_M2P_ENTRY )
-        {
-            P2M_PRINTK("shared mfn (%lx) on domain page list!\n",
-                    mfn);
-            continue;
-        }
-
-        p2mfn = gfn_to_mfn_type_p2m(p2m, gfn, &type, p2m_query);
-        if ( strict_m2p && mfn_x(p2mfn) != mfn )
-        {
-            mpbad++;
-            P2M_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
-                       " (-> gfn %#lx)\n",
-                       mfn, gfn, mfn_x(p2mfn),
-                       (mfn_valid(p2mfn)
-                        ? get_gpfn_from_mfn(mfn_x(p2mfn))
-                        : -1u));
-            /* This m2p entry is stale: the domain has another frame in
-             * this physical slot.  No great disaster, but for neatness,
-             * blow away the m2p entry. */
-            set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
-        }
-
-        if ( test_linear && (gfn <= p2m->max_mapped_pfn) )
-        {
-            lp2mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &type));
-            if ( lp2mfn != mfn_x(p2mfn) )
-            {
-                P2M_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
-                           "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn));
-            }
-        }
-
-        // P2M_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
-        //                mfn, gfn, mfn_x(p2mfn), lp2mfn);
-    }
-
-    spin_unlock(&d->page_alloc_lock);
-
-    /* Audit part two: walk the domain's p2m table, checking the entries. */
-    if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) != 0 )
-    {
-        l2_pgentry_t *l2e;
-        l1_pgentry_t *l1e;
-        int i1, i2;
-
-#if CONFIG_PAGING_LEVELS == 4
-        l4_pgentry_t *l4e;
-        l3_pgentry_t *l3e;
-        int i4, i3;
-        l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-#else /* CONFIG_PAGING_LEVELS == 3 */
-        l3_pgentry_t *l3e;
-        int i3;
-        l3e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-#endif
-
-        gfn = 0;
-#if CONFIG_PAGING_LEVELS >= 4
-        for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
-        {
-            if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
-            {
-                gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
-                continue;
-            }
-            l3e = map_domain_page(mfn_x(_mfn(l4e_get_pfn(l4e[i4]))));
-#endif
-            for ( i3 = 0;
-                  i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
-                  i3++ )
-            {
-                if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
-                {
-                    gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
-                    continue;
-                }
-
-                /* check for 1GB super page */
-                if ( l3e_get_flags(l3e[i3]) & _PAGE_PSE )
-                {
-                    mfn = l3e_get_pfn(l3e[i3]);
-                    ASSERT(mfn_valid(_mfn(mfn)));
-                    /* we have to cover 512x512 4K pages */
-                    for ( i2 = 0; 
-                          i2 < (L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES);
-                          i2++)
-                    {
-                        m2pfn = get_gpfn_from_mfn(mfn+i2);
-                        if ( m2pfn != (gfn + i2) )
-                        {
-                            pmbad++;
-                            P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
-                                       " -> gfn %#lx\n", gfn+i2, mfn+i2,
-                                       m2pfn);
-                            BUG();
-                        }
-                        gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
-                        continue;
-                    }
-                }
-
-                l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3]))));
-                for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
-                {
-                    if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
-                    {
-                        if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE)
-                             && ( p2m_flags_to_type(l2e_get_flags(l2e[i2]))
-                                  == p2m_populate_on_demand ) )
-                            entry_count+=SUPERPAGE_PAGES;
-                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
-                        continue;
-                    }
-                    
-                    /* check for super page */
-                    if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE )
-                    {
-                        mfn = l2e_get_pfn(l2e[i2]);
-                        ASSERT(mfn_valid(_mfn(mfn)));
-                        for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
-                        {
-                            m2pfn = get_gpfn_from_mfn(mfn+i1);
-                            /* Allow shared M2Ps */
-                            if ( (m2pfn != (gfn + i1)) &&
-                                 (m2pfn != SHARED_M2P_ENTRY) )
-                            {
-                                pmbad++;
-                                P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
-                                           " -> gfn %#lx\n", gfn+i1, mfn+i1,
-                                           m2pfn);
-                                BUG();
-                            }
-                        }
-                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
-                        continue;
-                    }
-
-                    l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
-
-                    for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
-                    {
-                        p2m_type_t type;
-
-                        type = p2m_flags_to_type(l1e_get_flags(l1e[i1]));
-                        if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
-                        {
-                            if ( type == p2m_populate_on_demand )
-                                entry_count++;
-                            continue;
-                        }
-                        mfn = l1e_get_pfn(l1e[i1]);
-                        ASSERT(mfn_valid(_mfn(mfn)));
-                        m2pfn = get_gpfn_from_mfn(mfn);
-                        if ( m2pfn != gfn &&
-                             type != p2m_mmio_direct &&
-                             !p2m_is_grant(type) &&
-                             !p2m_is_shared(type) )
-                        {
-                            pmbad++;
-                            printk("mismatch: gfn %#lx -> mfn %#lx"
-                                   " -> gfn %#lx\n", gfn, mfn, m2pfn);
-                            P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
-                                       " -> gfn %#lx\n", gfn, mfn, m2pfn);
-                            BUG();
-                        }
-                    }
-                    unmap_domain_page(l1e);
-                }
-                unmap_domain_page(l2e);
-            }
-#if CONFIG_PAGING_LEVELS >= 4
-            unmap_domain_page(l3e);
-        }
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
-        unmap_domain_page(l4e);
-#else /* CONFIG_PAGING_LEVELS == 3 */
-        unmap_domain_page(l3e);
-#endif
-
-    }
-
-    if ( entry_count != p2m->pod.entry_count )
-    {
-        printk("%s: refcounted entry count %d, audit count %d!\n",
-               __func__,
-               p2m->pod.entry_count,
-               entry_count);
-        BUG();
-    }
-        
-    //P2M_PRINTK("p2m audit complete\n");
-    //if ( orphans_i | orphans_d | mpbad | pmbad )
-    //    P2M_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
-    //                   orphans_i + orphans_d, orphans_i, orphans_d);
-    if ( mpbad | pmbad )
-    {
-        P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
-                   pmbad, mpbad);
-        WARN();
-    }
-}
-#endif /* P2M_AUDIT */
-
-
 
 static void
 p2m_remove_page(struct p2m_domain *p2m, unsigned long gfn, unsigned long mfn,
@@ -2475,88 +359,6 @@ guest_physmap_remove_entry(struct p2m_do
     p2m_unlock(p2m);
 }
 
-#if CONFIG_PAGING_LEVELS == 3
-static int gfn_check_limit(
-    struct domain *d, unsigned long gfn, unsigned int order)
-{
-    /*
-     * 32bit AMD nested paging does not support over 4GB guest due to 
-     * hardware translation limit. This limitation is checked by comparing
-     * gfn with 0xfffffUL.
-     */
-    if ( !hap_enabled(d) || ((gfn + (1ul << order)) <= 0x100000UL) ||
-         (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
-        return 0;
-
-    if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
-        dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
-                " 4GB: specify 'hap=0' domain config option.\n",
-                d->domain_id);
-
-    return -EINVAL;
-}
-#else
-#define gfn_check_limit(d, g, o) 0
-#endif
-
-int
-guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
-                                      unsigned int order)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    unsigned long i;
-    p2m_type_t ot;
-    mfn_t omfn;
-    int pod_count = 0;
-    int rc = 0;
-
-    BUG_ON(!paging_mode_translate(d));
-
-    rc = gfn_check_limit(d, gfn, order);
-    if ( rc != 0 )
-        return rc;
-
-    p2m_lock(p2m);
-    audit_p2m(p2m, 1);
-
-    P2M_DEBUG("mark pod gfn=%#lx\n", gfn);
-
-    /* Make sure all gpfns are unused */
-    for ( i = 0; i < (1UL << order); i++ )
-    {
-        omfn = gfn_to_mfn_query(p2m, gfn + i, &ot);
-        if ( p2m_is_ram(ot) )
-        {
-            printk("%s: gfn_to_mfn returned type %d!\n",
-                   __func__, ot);
-            rc = -EBUSY;
-            goto out;
-        }
-        else if ( ot == p2m_populate_on_demand )
-        {
-            /* Count how many PoD entries we'll be replacing if successful */
-            pod_count++;
-        }
-    }
-
-    /* Now, actually do the two-way mapping */
-    if ( !set_p2m_entry(p2m, gfn, _mfn(POPULATE_ON_DEMAND_MFN), order,
-                        p2m_populate_on_demand, p2m->default_access) )
-        rc = -EINVAL;
-    else
-    {
-        p2m->pod.entry_count += 1 << order; /* Lock: p2m */
-        p2m->pod.entry_count -= pod_count;
-        BUG_ON(p2m->pod.entry_count < 0);
-    }
-
-    audit_p2m(p2m, 1);
-    p2m_unlock(p2m);
-
-out:
-    return rc;
-}
-
 int
 guest_physmap_add_entry(struct p2m_domain *p2m, unsigned long gfn,
                         unsigned long mfn, unsigned int page_order, 
@@ -2588,7 +390,7 @@ guest_physmap_add_entry(struct p2m_domai
         return 0;
     }
 
-    rc = gfn_check_limit(d, gfn, page_order);
+    rc = p2m_gfn_check_limit(d, gfn, page_order);
     if ( rc != 0 )
         return rc;
 
@@ -2682,142 +484,6 @@ guest_physmap_add_entry(struct p2m_domai
     return rc;
 }
 
-/* Walk the whole p2m table, changing any entries of the old type
- * to the new type.  This is used in hardware-assisted paging to 
- * quickly enable or disable log-dirty tracking */
-void p2m_change_type_global(struct p2m_domain *p2m, p2m_type_t ot, p2m_type_t nt)
-{
-    unsigned long mfn, gfn, flags;
-    l1_pgentry_t l1e_content;
-    l1_pgentry_t *l1e;
-    l2_pgentry_t *l2e;
-    mfn_t l1mfn, l2mfn, l3mfn;
-    unsigned long i1, i2, i3;
-    l3_pgentry_t *l3e;
-#if CONFIG_PAGING_LEVELS == 4
-    l4_pgentry_t *l4e;
-    unsigned long i4;
-#endif /* CONFIG_PAGING_LEVELS == 4 */
-
-    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
-    BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
-
-    if ( !paging_mode_translate(p2m->domain) )
-        return;
-
-    if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) == 0 )
-        return;
-
-    ASSERT(p2m_locked_by_me(p2m));
-
-#if CONFIG_PAGING_LEVELS == 4
-    l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-#else /* CONFIG_PAGING_LEVELS == 3 */
-    l3mfn = _mfn(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-    l3e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
-    for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
-    {
-        if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
-        {
-            continue;
-        }
-        l3mfn = _mfn(l4e_get_pfn(l4e[i4]));
-        l3e = map_domain_page(l4e_get_pfn(l4e[i4]));
-#endif
-        for ( i3 = 0;
-              i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
-              i3++ )
-        {
-            if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
-            {
-                continue;
-            }
-            if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) )
-            {
-                flags = l3e_get_flags(l3e[i3]);
-                if ( p2m_flags_to_type(flags) != ot )
-                    continue;
-                mfn = l3e_get_pfn(l3e[i3]);
-                gfn = get_gpfn_from_mfn(mfn);
-                flags = p2m_type_to_flags(nt, _mfn(mfn));
-                l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
-                p2m->write_p2m_entry(p2m, gfn,
-                                     (l1_pgentry_t *)&l3e[i3],
-                                     l3mfn, l1e_content, 3);
-                continue;
-            }
-
-            l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
-            l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
-            for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
-            {
-                if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
-                {
-                    continue;
-                }
-
-                if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
-                {
-                    flags = l2e_get_flags(l2e[i2]);
-                    if ( p2m_flags_to_type(flags) != ot )
-                        continue;
-                    mfn = l2e_get_pfn(l2e[i2]);
-                    /* Do not use get_gpfn_from_mfn because it may return 
-                       SHARED_M2P_ENTRY */
-                    gfn = (i2 + (i3
-#if CONFIG_PAGING_LEVELS >= 4
-                                  + (i4 * L3_PAGETABLE_ENTRIES)
-#endif
-                               )
-                           * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
-                    flags = p2m_type_to_flags(nt, _mfn(mfn));
-                    l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
-                    p2m->write_p2m_entry(p2m, gfn,
-                                         (l1_pgentry_t *)&l2e[i2],
-                                         l2mfn, l1e_content, 2);
-                    continue;
-                }
-
-                l1mfn = _mfn(l2e_get_pfn(l2e[i2]));
-                l1e = map_domain_page(mfn_x(l1mfn));
-
-                for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
-                {
-                    flags = l1e_get_flags(l1e[i1]);
-                    if ( p2m_flags_to_type(flags) != ot )
-                        continue;
-                    mfn = l1e_get_pfn(l1e[i1]);
-                    gfn = i1 + (i2 + (i3
-#if CONFIG_PAGING_LEVELS >= 4
-                                       + (i4 * L3_PAGETABLE_ENTRIES)
-#endif
-                                    )
-                           * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
-                    /* create a new 1le entry with the new type */
-                    flags = p2m_type_to_flags(nt, _mfn(mfn));
-                    l1e_content = l1e_from_pfn(mfn, flags);
-                    p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
-                                         l1mfn, l1e_content, 1);
-                }
-                unmap_domain_page(l1e);
-            }
-            unmap_domain_page(l2e);
-        }
-#if CONFIG_PAGING_LEVELS >= 4
-        unmap_domain_page(l3e);
-    }
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
-    unmap_domain_page(l4e);
-#else /* CONFIG_PAGING_LEVELS == 3 */
-    unmap_domain_page(l3e);
-#endif
-
-}
 
 /* Modify the p2m type of a single gfn from ot to nt, returning the 
  * entry's previous type.  Resets the access permissions. */
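
(A minimal sketch, not taken from the patch: based on the function-pointer
assignments removed from p2m_initialise() in the hunk above, the pt-specific
initialiser p2m_pt_init() presumably ends up in p2m-pt.c looking roughly like
this. Only the name p2m_pt_init comes from the patch; the body is an assumption.)

void p2m_pt_init(struct p2m_domain *p2m)
{
    /* Assumed: the same pointer setup that used to live in p2m_initialise(),
     * now private to the pagetable implementation of the p2m. */
    p2m->set_entry = p2m_set_entry;
    p2m->get_entry = p2m_gfn_to_mfn;
    p2m->get_entry_current = p2m_gfn_to_mfn_current;
    p2m->change_entry_type_global = p2m_change_type_global;
    p2m->write_p2m_entry = paging_write_p2m_entry;
}
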
diff -r 4b0692880dfa -r 26c4beb6b520 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h Thu May 05 17:40:34 2011 +0100
+++ b/xen/include/asm-x86/p2m.h Fri May 06 11:15:35 2011 +0100
@@ -638,6 +638,34 @@ static inline void p2m_mem_access_check(
 struct page_info *p2m_alloc_ptp(struct p2m_domain *p2m, unsigned long type);
 void p2m_free_ptp(struct p2m_domain *p2m, struct page_info *pg);
 
+#if CONFIG_PAGING_LEVELS == 3
+static inline int p2m_gfn_check_limit(
+    struct domain *d, unsigned long gfn, unsigned int order)
+{
+    /*
+     * 32bit AMD nested paging does not support over 4GB guest due to 
+     * hardware translation limit. This limitation is checked by comparing
+     * gfn with 0xfffffUL.
+     */
+    if ( !hap_enabled(d) || ((gfn + (1ul << order)) <= 0x100000UL) ||
+         (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
+        return 0;
+
+    if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
+        dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
+                " 4GB: specify 'hap=0' domain config option.\n",
+                d->domain_id);
+
+    return -EINVAL;
+}
+#else
+#define p2m_gfn_check_limit(d, g, o) 0
+#endif
+
+/* Directly set a p2m entry: only for use by p2m code */
+int set_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
+                  unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma);
+
 #endif /* _XEN_P2M_H */
 
 /*

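(Usage note, illustrative only: with the 'static' dropped from set_p2m_entry()
and the declaration added to p2m.h above, the PoD code that moves out to
p2m-pod.c can keep calling it directly, for example as in the moved
zero-check path:)

    set_p2m_entry(p2m, gfn, _mfn(POPULATE_ON_DEMAND_MFN), 9,
                  p2m_populate_on_demand, p2m->default_access);
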
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
