# HG changeset patch
# User Tim Deegan <Tim.Deegan@xxxxxxxxxx>
# Date 1304676935 -3600
# Node ID 26c4beb6b520733883eb1fb2aac8701de9188e08
# Parent 4b0692880dfa557d4e1537c7a58c412c1286a416
x86/mm/p2m: break into common, pt-implementation and pod parts.
Start to make a clearer distinction between the generic p2m functions and
the implementation of the data structure as an x86 pagetable.
Also move the EPT data-structure implementation into x86/mm/ to match,
and split the PoD admin code into its own file.
This is just code motion, except for splitting the p2m_initialise
function into a pt-specific part and a common part.
Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
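
For reference, the p2m_initialise split described above roughly takes the
shape sketched below.  This is only an illustrative sketch, not part of the
patch: the field assignments are trimmed to a minimum, the hap/vmx dispatch
condition is illustrative, and p2m_pt_init() is a hypothetical name for the
pt-specific half that ends up in p2m-pt.c.

    /* Sketch only: common setup stays in p2m.c, while each datastructure
     * implementation installs its own entry points.  p2m_pt_init() is an
     * assumed name for the pt-specific initialiser. */
    static void p2m_initialise(struct domain *d, struct p2m_domain *p2m)
    {
        /* Common part: record the owning domain and reset PoD state. */
        p2m->domain = d;
        p2m->pod.entry_count = 0;

        /* Implementation-specific part: install the set/get hooks. */
        if ( hap_enabled(d) && cpu_has_vmx )
            ept_p2m_init(d);      /* EPT implementation, x86/mm/p2m-ept.c */
        else
            p2m_pt_init(p2m);     /* pagetable implementation, x86/mm/p2m-pt.c */
    }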
diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/Makefile
--- a/xen/arch/x86/mm/Makefile Thu May 05 17:40:34 2011 +0100
+++ b/xen/arch/x86/mm/Makefile Fri May 06 11:15:35 2011 +0100
@@ -2,7 +2,7 @@ subdir-y += shadow
subdir-y += hap
obj-y += paging.o
-obj-y += p2m.o
+obj-y += p2m.o p2m-pt.o p2m-ept.o p2m-pod.o
obj-y += guest_walk_2.o
obj-y += guest_walk_3.o
obj-$(x86_64) += guest_walk_4.o
diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/hap/Makefile
--- a/xen/arch/x86/mm/hap/Makefile Thu May 05 17:40:34 2011 +0100
+++ b/xen/arch/x86/mm/hap/Makefile Fri May 06 11:15:35 2011 +0100
@@ -2,7 +2,6 @@ obj-y += hap.o
obj-y += guest_walk_2level.o
obj-y += guest_walk_3level.o
obj-$(x86_64) += guest_walk_4level.o
-obj-y += p2m-ept.o
obj-y += nested_hap.o
guest_walk_%level.o: guest_walk.c Makefile
diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c Thu May 05 17:40:34 2011 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,911 +0,0 @@
-/*
- * ept-p2m.c: use the EPT page table as p2m
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- */
-
-#include <xen/config.h>
-#include <xen/domain_page.h>
-#include <xen/sched.h>
-#include <asm/current.h>
-#include <asm/paging.h>
-#include <asm/types.h>
-#include <asm/domain.h>
-#include <asm/p2m.h>
-#include <asm/hvm/vmx/vmx.h>
-#include <asm/hvm/vmx/vmcs.h>
-#include <xen/iommu.h>
-#include <asm/mtrr.h>
-#include <asm/hvm/cacheattr.h>
-#include <xen/keyhandler.h>
-#include <xen/softirq.h>
-
-#define atomic_read_ept_entry(__pepte) \
- ( (ept_entry_t) { .epte = atomic_read64(&(__pepte)->epte) } )
-#define atomic_write_ept_entry(__pepte, __epte) \
- atomic_write64(&(__pepte)->epte, (__epte).epte)
-
-#define is_epte_present(ept_entry) ((ept_entry)->epte & 0x7)
-#define is_epte_superpage(ept_entry) ((ept_entry)->sp)
-
-/* Non-ept "lock-and-check" wrapper */
-static int ept_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn,
- ept_entry_t *entry, int order,
- p2m_query_t q)
-{
- /* Only take the lock if we don't already have it. Otherwise it
- * wouldn't be safe to do p2m lookups with the p2m lock held */
- int do_locking = !p2m_locked_by_me(p2m);
- int r;
-
- if ( do_locking )
- p2m_lock(p2m);
-
- /* Check to make sure this is still PoD */
- if ( entry->sa_p2mt != p2m_populate_on_demand )
- {
- if ( do_locking )
- p2m_unlock(p2m);
- return 0;
- }
-
- r = p2m_pod_demand_populate(p2m, gfn, order, q);
-
- if ( do_locking )
- p2m_unlock(p2m);
-
- return r;
-}
-
-static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_access_t access)
-{
- /* First apply type permissions */
- switch(type)
- {
- case p2m_invalid:
- case p2m_mmio_dm:
- case p2m_populate_on_demand:
- case p2m_ram_paging_out:
- case p2m_ram_paged:
- case p2m_ram_paging_in:
- case p2m_ram_paging_in_start:
- default:
- entry->r = entry->w = entry->x = 0;
- break;
- case p2m_ram_rw:
- entry->r = entry->w = entry->x = 1;
- break;
- case p2m_mmio_direct:
- entry->r = entry->x = 1;
- entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
- entry->mfn);
- break;
- case p2m_ram_logdirty:
- case p2m_ram_ro:
- case p2m_ram_shared:
- entry->r = entry->x = 1;
- entry->w = 0;
- break;
- case p2m_grant_map_rw:
- entry->r = entry->w = 1;
- entry->x = 0;
- break;
- case p2m_grant_map_ro:
- entry->r = 1;
- entry->w = entry->x = 0;
- break;
- }
-
-
- /* Then restrict with access permissions */
- switch (access)
- {
- case p2m_access_n:
- entry->r = entry->w = entry->x = 0;
- break;
- case p2m_access_r:
- entry->w = entry->x = 0;
- break;
- case p2m_access_w:
- entry->r = entry->x = 0;
- break;
- case p2m_access_x:
- entry->r = entry->w = 0;
- break;
- case p2m_access_rx:
- case p2m_access_rx2rw:
- entry->w = 0;
- break;
- case p2m_access_wx:
- entry->r = 0;
- break;
- case p2m_access_rw:
- entry->x = 0;
- break;
- case p2m_access_rwx:
- break;
- }
-
-}
-
-#define GUEST_TABLE_MAP_FAILED 0
-#define GUEST_TABLE_NORMAL_PAGE 1
-#define GUEST_TABLE_SUPER_PAGE 2
-#define GUEST_TABLE_POD_PAGE 3
-
-/* Fill in middle levels of ept table */
-static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry)
-{
- struct page_info *pg;
-
- pg = p2m_alloc_ptp(p2m, 0);
- if ( pg == NULL )
- return 0;
-
- ept_entry->epte = 0;
- ept_entry->mfn = page_to_mfn(pg);
- ept_entry->access = p2m->default_access;
-
- ept_entry->r = ept_entry->w = ept_entry->x = 1;
-
- return 1;
-}
-
-/* free ept sub tree behind an entry */
-void ept_free_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry, int level)
-{
- /* End if the entry is a leaf entry. */
- if ( level == 0 || !is_epte_present(ept_entry) ||
- is_epte_superpage(ept_entry) )
- return;
-
- if ( level > 1 )
- {
- ept_entry_t *epte = map_domain_page(ept_entry->mfn);
- for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
- ept_free_entry(p2m, epte + i, level - 1);
- unmap_domain_page(epte);
- }
-
- p2m_free_ptp(p2m, mfn_to_page(ept_entry->mfn));
-}
-
-static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,
- int level, int target)
-{
- ept_entry_t new_ept, *table;
- uint64_t trunk;
- int rv = 1;
-
- /* End if the entry is a leaf entry or reaches the target level. */
- if ( level == 0 || level == target )
- return rv;
-
- ASSERT(is_epte_superpage(ept_entry));
-
- if ( !ept_set_middle_entry(p2m, &new_ept) )
- return 0;
-
- table = map_domain_page(new_ept.mfn);
- trunk = 1UL << ((level - 1) * EPT_TABLE_ORDER);
-
- for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
- {
- ept_entry_t *epte = table + i;
-
- epte->epte = 0;
- epte->emt = ept_entry->emt;
- epte->ipat = ept_entry->ipat;
- epte->sp = (level > 1) ? 1 : 0;
- epte->access = ept_entry->access;
- epte->sa_p2mt = ept_entry->sa_p2mt;
- epte->mfn = ept_entry->mfn + i * trunk;
- epte->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0;
-
- ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);
-
- if ( (level - 1) == target )
- continue;
-
- ASSERT(is_epte_superpage(epte));
-
- if ( !(rv = ept_split_super_page(p2m, epte, level - 1, target)) )
- break;
- }
-
- unmap_domain_page(table);
-
- /* Even failed we should install the newly allocated ept page. */
- *ept_entry = new_ept;
-
- return rv;
-}
-
-/* Take the currently mapped table, find the corresponding gfn entry,
- * and map the next table, if available. If the entry is empty
- * and read_only is set,
- * Return values:
- * 0: Failed to map. Either read_only was set and the entry was
- * empty, or allocating a new page failed.
- * GUEST_TABLE_NORMAL_PAGE: next level mapped normally
- * GUEST_TABLE_SUPER_PAGE:
- * The next entry points to a superpage, and caller indicates
- * that they are going to the superpage level, or are only doing
- * a read.
- * GUEST_TABLE_POD:
- * The next entry is marked populate-on-demand.
- */
-static int ept_next_level(struct p2m_domain *p2m, bool_t read_only,
- ept_entry_t **table, unsigned long *gfn_remainder,
- int next_level)
-{
- unsigned long mfn;
- ept_entry_t *ept_entry, e;
- u32 shift, index;
-
- shift = next_level * EPT_TABLE_ORDER;
-
- index = *gfn_remainder >> shift;
-
- /* index must be falling into the page */
- ASSERT(index < EPT_PAGETABLE_ENTRIES);
-
- ept_entry = (*table) + index;
-
- /* ept_next_level() is called (sometimes) without a lock. Read
- * the entry once, and act on the "cached" entry after that to
- * avoid races. */
- e = atomic_read_ept_entry(ept_entry);
-
- if ( !is_epte_present(&e) )
- {
- if ( e.sa_p2mt == p2m_populate_on_demand )
- return GUEST_TABLE_POD_PAGE;
-
- if ( read_only )
- return GUEST_TABLE_MAP_FAILED;
-
- if ( !ept_set_middle_entry(p2m, ept_entry) )
- return GUEST_TABLE_MAP_FAILED;
- else
- e = atomic_read_ept_entry(ept_entry); /* Refresh */
- }
-
- /* The only time sp would be set here is if we had hit a superpage */
- if ( is_epte_superpage(&e) )
- return GUEST_TABLE_SUPER_PAGE;
-
- mfn = e.mfn;
- unmap_domain_page(*table);
- *table = map_domain_page(mfn);
- *gfn_remainder &= (1UL << shift) - 1;
- return GUEST_TABLE_NORMAL_PAGE;
-}
-
-/*
- * ept_set_entry() computes 'need_modify_vtd_table' for itself,
- * by observing whether any gfn->mfn translations are modified.
- */
-static int
-ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
- unsigned int order, p2m_type_t p2mt, p2m_access_t p2ma)
-{
- ept_entry_t *table, *ept_entry = NULL;
- unsigned long gfn_remainder = gfn;
- unsigned long offset = 0;
- u32 index;
- int i, target = order / EPT_TABLE_ORDER;
- int rv = 0;
- int ret = 0;
- bool_t direct_mmio = (p2mt == p2m_mmio_direct);
- uint8_t ipat = 0;
- int need_modify_vtd_table = 1;
- int vtd_pte_present = 0;
- int needs_sync = 1;
- struct domain *d = p2m->domain;
- ept_entry_t old_entry = { .epte = 0 };
-
- /*
- * the caller must make sure:
- * 1. passing valid gfn and mfn at order boundary.
- * 2. gfn not exceeding guest physical address width.
- * 3. passing a valid order.
- */
- if ( ((gfn | mfn_x(mfn)) & ((1UL << order) - 1)) ||
- ((u64)gfn >> ((ept_get_wl(d) + 1) * EPT_TABLE_ORDER)) ||
- (order % EPT_TABLE_ORDER) )
- return 0;
-
- ASSERT((target == 2 && hvm_hap_has_1gb(d)) ||
- (target == 1 && hvm_hap_has_2mb(d)) ||
- (target == 0));
-
- table = map_domain_page(ept_get_asr(d));
-
- ASSERT(table != NULL);
-
- for ( i = ept_get_wl(d); i > target; i-- )
- {
- ret = ept_next_level(p2m, 0, &table, &gfn_remainder, i);
- if ( !ret )
- goto out;
- else if ( ret != GUEST_TABLE_NORMAL_PAGE )
- break;
- }
-
- ASSERT(ret != GUEST_TABLE_POD_PAGE || i != target);
-
- index = gfn_remainder >> (i * EPT_TABLE_ORDER);
- offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1);
-
- ept_entry = table + index;
-
- /* In case VT-d uses same page table, this flag is needed by VT-d */
- vtd_pte_present = is_epte_present(ept_entry) ? 1 : 0;
-
- /*
- * If we're here with i > target, we must be at a leaf node, and
- * we need to break up the superpage.
- *
- * If we're here with i == target and i > 0, we need to check to see
- * if we're replacing a non-leaf entry (i.e., pointing to an N-1 table)
-     * with a leaf entry (a 1GiB or 2MiB page), and handle things appropriately.
- */
-
- if ( i == target )
- {
- /* We reached the target level. */
- ept_entry_t new_entry = { .epte = 0 };
-
- /* No need to flush if the old entry wasn't valid */
- if ( !is_epte_present(ept_entry) )
- needs_sync = 0;
-
-        /* If we're replacing a non-leaf entry with a leaf entry (1GiB or 2MiB),
- * the intermediate tables will be freed below after the ept flush
- *
- * Read-then-write is OK because we hold the p2m lock. */
- old_entry = *ept_entry;
-
- if ( mfn_valid(mfn_x(mfn)) || direct_mmio || p2m_is_paged(p2mt) ||
- (p2mt == p2m_ram_paging_in_start) )
- {
- /* Construct the new entry, and then write it once */
- new_entry.emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat,
- direct_mmio);
-
- new_entry.ipat = ipat;
- new_entry.sp = order ? 1 : 0;
- new_entry.sa_p2mt = p2mt;
- new_entry.access = p2ma;
- new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
-
- new_entry.mfn = mfn_x(mfn);
-
- if ( old_entry.mfn == new_entry.mfn )
- need_modify_vtd_table = 0;
-
- ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
- }
-
- atomic_write_ept_entry(ept_entry, new_entry);
- }
- else
- {
- /* We need to split the original page. */
- ept_entry_t split_ept_entry;
- ept_entry_t new_entry = { .epte = 0 };
-
- ASSERT(is_epte_superpage(ept_entry));
-
- split_ept_entry = atomic_read_ept_entry(ept_entry);
-
- if ( !ept_split_super_page(p2m, &split_ept_entry, i, target) )
- {
- ept_free_entry(p2m, &split_ept_entry, i);
- goto out;
- }
-
- /* now install the newly split ept sub-tree */
- /* NB: please make sure domian is paused and no in-fly VT-d DMA. */
- atomic_write_ept_entry(ept_entry, split_ept_entry);
-
- /* then move to the level we want to make real changes */
- for ( ; i > target; i-- )
- ept_next_level(p2m, 0, &table, &gfn_remainder, i);
-
- ASSERT(i == target);
-
- index = gfn_remainder >> (i * EPT_TABLE_ORDER);
- offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1);
-
- ept_entry = table + index;
-
- new_entry.emt = epte_get_entry_emt(d, gfn, mfn, &ipat, direct_mmio);
- new_entry.ipat = ipat;
- new_entry.sp = i ? 1 : 0;
- new_entry.sa_p2mt = p2mt;
- new_entry.access = p2ma;
- new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
-
- /* the caller should take care of the previous page */
- new_entry.mfn = mfn_x(mfn);
-
- /* Safe to read-then-write because we hold the p2m lock */
- if ( ept_entry->mfn == new_entry.mfn )
- need_modify_vtd_table = 0;
-
- ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
-
- atomic_write_ept_entry(ept_entry, new_entry);
- }
-
- /* Track the highest gfn for which we have ever had a valid mapping */
- if ( mfn_valid(mfn_x(mfn)) &&
- (gfn + (1UL << order) - 1 > p2m->max_mapped_pfn) )
- p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
-
- /* Success */
- rv = 1;
-
-out:
- unmap_domain_page(table);
-
- if ( needs_sync )
- ept_sync_domain(p2m->domain);
-
-    if ( rv && iommu_enabled && need_iommu(p2m->domain) && need_modify_vtd_table )
- {
- if ( iommu_hap_pt_share )
- iommu_pte_flush(d, gfn, (u64*)ept_entry, order, vtd_pte_present);
- else
- {
- if ( p2mt == p2m_ram_rw )
- {
- if ( order > 0 )
- {
- for ( i = 0; i < (1 << order); i++ )
- iommu_map_page(
-                            p2m->domain, gfn - offset + i, mfn_x(mfn) - offset + i,
- IOMMUF_readable | IOMMUF_writable);
- }
- else if ( !order )
- iommu_map_page(
-                    p2m->domain, gfn, mfn_x(mfn), IOMMUF_readable | IOMMUF_writable);
- }
- else
- {
- if ( order > 0 )
- {
- for ( i = 0; i < (1 << order); i++ )
- iommu_unmap_page(p2m->domain, gfn - offset + i);
- }
- else if ( !order )
- iommu_unmap_page(p2m->domain, gfn);
- }
- }
- }
-
- /* Release the old intermediate tables, if any. This has to be the
- last thing we do, after the ept_sync_domain() and removal
- from the iommu tables, so as to avoid a potential
- use-after-free. */
- if ( is_epte_present(&old_entry) )
- ept_free_entry(p2m, &old_entry, target);
-
- return rv;
-}
-
-/* Read ept p2m entries */
-static mfn_t ept_get_entry(struct p2m_domain *p2m,
- unsigned long gfn, p2m_type_t *t, p2m_access_t* a,
- p2m_query_t q)
-{
- struct domain *d = p2m->domain;
- ept_entry_t *table = map_domain_page(ept_get_asr(d));
- unsigned long gfn_remainder = gfn;
- ept_entry_t *ept_entry;
- u32 index;
- int i;
- int ret = 0;
- mfn_t mfn = _mfn(INVALID_MFN);
-
- *t = p2m_mmio_dm;
- *a = p2m_access_n;
-
- /* This pfn is higher than the highest the p2m map currently holds */
- if ( gfn > p2m->max_mapped_pfn )
- goto out;
-
- /* Should check if gfn obeys GAW here. */
-
- for ( i = ept_get_wl(d); i > 0; i-- )
- {
- retry:
- ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
- if ( !ret )
- goto out;
- else if ( ret == GUEST_TABLE_POD_PAGE )
- {
- if ( q == p2m_query )
- {
- *t = p2m_populate_on_demand;
- goto out;
- }
-
- /* Populate this superpage */
- ASSERT(i == 1);
-
- index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
- ept_entry = table + index;
-
- if ( !ept_pod_check_and_populate(p2m, gfn,
- ept_entry, 9, q) )
- goto retry;
- else
- goto out;
- }
- else if ( ret == GUEST_TABLE_SUPER_PAGE )
- break;
- }
-
- index = gfn_remainder >> (i * EPT_TABLE_ORDER);
- ept_entry = table + index;
-
- if ( ept_entry->sa_p2mt == p2m_populate_on_demand )
- {
- if ( q == p2m_query )
- {
- *t = p2m_populate_on_demand;
- goto out;
- }
-
- ASSERT(i == 0);
-
- if ( ept_pod_check_and_populate(p2m, gfn,
- ept_entry, 0, q) )
- goto out;
- }
-
- /* Need to check for all-zeroes because typecode 0 is p2m_ram and an
- * entirely empty entry shouldn't have RAM type. */
- if ( ept_entry->epte != 0 && ept_entry->sa_p2mt != p2m_invalid )
- {
- *t = ept_entry->sa_p2mt;
- *a = ept_entry->access;
-
- mfn = _mfn(ept_entry->mfn);
- if ( i )
- {
- /*
- * We may meet super pages, and to split into 4k pages
- * to emulate p2m table
- */
- unsigned long split_mfn = mfn_x(mfn) +
- (gfn_remainder &
- ((1 << (i * EPT_TABLE_ORDER)) - 1));
- mfn = _mfn(split_mfn);
- }
- }
-
-out:
- unmap_domain_page(table);
- return mfn;
-}
-
-/* WARNING: Only caller doesn't care about PoD pages. So this function will
- * always return 0 for PoD pages, not populate them. If that becomes necessary,
- * pass a p2m_query_t type along to distinguish. */
-static ept_entry_t ept_get_entry_content(struct p2m_domain *p2m,
- unsigned long gfn, int *level)
-{
- ept_entry_t *table = map_domain_page(ept_get_asr(p2m->domain));
- unsigned long gfn_remainder = gfn;
- ept_entry_t *ept_entry;
- ept_entry_t content = { .epte = 0 };
- u32 index;
- int i;
- int ret=0;
-
- /* This pfn is higher than the highest the p2m map currently holds */
- if ( gfn > p2m->max_mapped_pfn )
- goto out;
-
- for ( i = ept_get_wl(p2m->domain); i > 0; i-- )
- {
- ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
- if ( !ret || ret == GUEST_TABLE_POD_PAGE )
- goto out;
- else if ( ret == GUEST_TABLE_SUPER_PAGE )
- break;
- }
-
- index = gfn_remainder >> (i * EPT_TABLE_ORDER);
- ept_entry = table + index;
- content = *ept_entry;
- *level = i;
-
- out:
- unmap_domain_page(table);
- return content;
-}
-
-void ept_walk_table(struct domain *d, unsigned long gfn)
-{
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
- ept_entry_t *table = map_domain_page(ept_get_asr(d));
- unsigned long gfn_remainder = gfn;
-
- int i;
-
- gdprintk(XENLOG_ERR, "Walking EPT tables for domain %d gfn %lx\n",
- d->domain_id, gfn);
-
- /* This pfn is higher than the highest the p2m map currently holds */
- if ( gfn > p2m->max_mapped_pfn )
- {
- gdprintk(XENLOG_ERR, " gfn exceeds max_mapped_pfn %lx\n",
- p2m->max_mapped_pfn);
- goto out;
- }
-
- for ( i = ept_get_wl(d); i >= 0; i-- )
- {
- ept_entry_t *ept_entry, *next;
- u32 index;
-
- /* Stolen from ept_next_level */
- index = gfn_remainder >> (i*EPT_TABLE_ORDER);
- ept_entry = table + index;
-
- gdprintk(XENLOG_ERR, " epte %"PRIx64"\n", ept_entry->epte);
-
- if ( (i == 0) || !is_epte_present(ept_entry) ||
- is_epte_superpage(ept_entry) )
- goto out;
- else
- {
- gfn_remainder &= (1UL << (i*EPT_TABLE_ORDER)) - 1;
-
- next = map_domain_page(ept_entry->mfn);
-
- unmap_domain_page(table);
-
- table = next;
- }
- }
-
-out:
- unmap_domain_page(table);
- return;
-}
-
-static mfn_t ept_get_entry_current(struct p2m_domain *p2m,
-                           unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
- p2m_query_t q)
-{
- return ept_get_entry(p2m, gfn, t, a, q);
-}
-
-/*
- * To test if the new emt type is the same with old,
- * return 1 to not to reset ept entry.
- */
-static int need_modify_ept_entry(struct p2m_domain *p2m, unsigned long gfn,
- mfn_t mfn, uint8_t o_ipat, uint8_t o_emt,
- p2m_type_t p2mt)
-{
- uint8_t ipat;
- uint8_t emt;
- bool_t direct_mmio = (p2mt == p2m_mmio_direct);
-
- emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat, direct_mmio);
-
- if ( (emt == o_emt) && (ipat == o_ipat) )
- return 0;
-
- return 1;
-}
-
-void ept_change_entry_emt_with_range(struct domain *d,
- unsigned long start_gfn,
- unsigned long end_gfn)
-{
- unsigned long gfn;
- ept_entry_t e;
- mfn_t mfn;
- int order = 0;
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
- p2m_lock(p2m);
- for ( gfn = start_gfn; gfn <= end_gfn; gfn++ )
- {
- int level = 0;
- uint64_t trunk = 0;
-
- e = ept_get_entry_content(p2m, gfn, &level);
- if ( !p2m_has_emt(e.sa_p2mt) )
- continue;
-
- order = 0;
- mfn = _mfn(e.mfn);
-
- if ( is_epte_superpage(&e) )
- {
- while ( level )
- {
- trunk = (1UL << (level * EPT_TABLE_ORDER)) - 1;
- if ( !(gfn & trunk) && (gfn + trunk <= end_gfn) )
- {
- /* gfn assigned with 2M or 1G, and the end covers more than
- * the super page areas.
- * Set emt for super page.
- */
- order = level * EPT_TABLE_ORDER;
- if ( need_modify_ept_entry(p2m, gfn, mfn,
- e.ipat, e.emt, e.sa_p2mt) )
-                        ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt, e.access);
- gfn += trunk;
- break;
- }
- level--;
- }
- }
- else /* gfn assigned with 4k */
- {
-            if ( need_modify_ept_entry(p2m, gfn, mfn, e.ipat, e.emt, e.sa_p2mt) )
- ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt, e.access);
- }
- }
- p2m_unlock(p2m);
-}
-
-/*
- * Walk the whole p2m table, changing any entries of the old type
- * to the new type. This is used in hardware-assisted paging to
- * quickly enable or diable log-dirty tracking
- */
-static void ept_change_entry_type_page(mfn_t ept_page_mfn, int ept_page_level,
- p2m_type_t ot, p2m_type_t nt)
-{
- ept_entry_t e, *epte = map_domain_page(mfn_x(ept_page_mfn));
-
- for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
- {
- if ( !is_epte_present(epte + i) )
- continue;
-
- if ( (ept_page_level > 0) && !is_epte_superpage(epte + i) )
- ept_change_entry_type_page(_mfn(epte[i].mfn),
- ept_page_level - 1, ot, nt);
- else
- {
- e = atomic_read_ept_entry(&epte[i]);
- if ( e.sa_p2mt != ot )
- continue;
-
- e.sa_p2mt = nt;
- ept_p2m_type_to_flags(&e, nt, e.access);
- atomic_write_ept_entry(&epte[i], e);
- }
- }
-
- unmap_domain_page(epte);
-}
-
-static void ept_change_entry_type_global(struct p2m_domain *p2m,
- p2m_type_t ot, p2m_type_t nt)
-{
- struct domain *d = p2m->domain;
- if ( ept_get_asr(d) == 0 )
- return;
-
- BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
- BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
-
- ept_change_entry_type_page(_mfn(ept_get_asr(d)), ept_get_wl(d), ot, nt);
-
- ept_sync_domain(d);
-}
-
-void ept_p2m_init(struct domain *d)
-{
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
- p2m->set_entry = ept_set_entry;
- p2m->get_entry = ept_get_entry;
- p2m->get_entry_current = ept_get_entry_current;
- p2m->change_entry_type_global = ept_change_entry_type_global;
-}
-
-static void ept_dump_p2m_table(unsigned char key)
-{
- struct domain *d;
- ept_entry_t *table, *ept_entry;
- mfn_t mfn;
- int order;
- int i;
- int is_pod;
- int ret = 0;
- unsigned long index;
- unsigned long gfn, gfn_remainder;
- unsigned long record_counter = 0;
- struct p2m_domain *p2m;
-
- for_each_domain(d)
- {
- if ( !hap_enabled(d) )
- continue;
-
- p2m = p2m_get_hostp2m(d);
- printk("\ndomain%d EPT p2m table: \n", d->domain_id);
-
- for ( gfn = 0; gfn <= p2m->max_mapped_pfn; gfn += (1 << order) )
- {
- gfn_remainder = gfn;
- mfn = _mfn(INVALID_MFN);
- table = map_domain_page(ept_get_asr(d));
-
- for ( i = ept_get_wl(d); i > 0; i-- )
- {
- ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
- if ( ret != GUEST_TABLE_NORMAL_PAGE )
- break;
- }
-
- order = i * EPT_TABLE_ORDER;
-
- if ( ret == GUEST_TABLE_MAP_FAILED )
- goto out;
-
- index = gfn_remainder >> order;
- ept_entry = table + index;
- if ( ept_entry->sa_p2mt != p2m_invalid )
- {
- ( ept_entry->sa_p2mt == p2m_populate_on_demand ) ?
- ( mfn = _mfn(INVALID_MFN), is_pod = 1 ) :
- ( mfn = _mfn(ept_entry->mfn), is_pod = 0 );
-
- printk("gfn: %-16lx mfn: %-16lx order: %2d is_pod: %d\n",
- gfn, mfn_x(mfn), order, is_pod);
-
- if ( !(record_counter++ % 100) )
- process_pending_softirqs();
- }
-out:
- unmap_domain_page(table);
- }
- }
-}
-
-static struct keyhandler ept_p2m_table = {
- .diagnostic = 0,
- .u.fn = ept_dump_p2m_table,
- .desc = "dump ept p2m table"
-};
-
-void setup_ept_dump(void)
-{
- register_keyhandler('D', &ept_p2m_table);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/p2m-ept.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/p2m-ept.c Fri May 06 11:15:35 2011 +0100
@@ -0,0 +1,911 @@
+/*
+ * ept-p2m.c: use the EPT page table as p2m
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <xen/config.h>
+#include <xen/domain_page.h>
+#include <xen/sched.h>
+#include <asm/current.h>
+#include <asm/paging.h>
+#include <asm/types.h>
+#include <asm/domain.h>
+#include <asm/p2m.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <asm/hvm/vmx/vmcs.h>
+#include <xen/iommu.h>
+#include <asm/mtrr.h>
+#include <asm/hvm/cacheattr.h>
+#include <xen/keyhandler.h>
+#include <xen/softirq.h>
+
+#define atomic_read_ept_entry(__pepte) \
+ ( (ept_entry_t) { .epte = atomic_read64(&(__pepte)->epte) } )
+#define atomic_write_ept_entry(__pepte, __epte) \
+ atomic_write64(&(__pepte)->epte, (__epte).epte)
+
+#define is_epte_present(ept_entry) ((ept_entry)->epte & 0x7)
+#define is_epte_superpage(ept_entry) ((ept_entry)->sp)
+
+/* Non-ept "lock-and-check" wrapper */
+static int ept_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn,
+ ept_entry_t *entry, int order,
+ p2m_query_t q)
+{
+ /* Only take the lock if we don't already have it. Otherwise it
+ * wouldn't be safe to do p2m lookups with the p2m lock held */
+ int do_locking = !p2m_locked_by_me(p2m);
+ int r;
+
+ if ( do_locking )
+ p2m_lock(p2m);
+
+ /* Check to make sure this is still PoD */
+ if ( entry->sa_p2mt != p2m_populate_on_demand )
+ {
+ if ( do_locking )
+ p2m_unlock(p2m);
+ return 0;
+ }
+
+ r = p2m_pod_demand_populate(p2m, gfn, order, q);
+
+ if ( do_locking )
+ p2m_unlock(p2m);
+
+ return r;
+}
+
+static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_access_t access)
+{
+ /* First apply type permissions */
+ switch(type)
+ {
+ case p2m_invalid:
+ case p2m_mmio_dm:
+ case p2m_populate_on_demand:
+ case p2m_ram_paging_out:
+ case p2m_ram_paged:
+ case p2m_ram_paging_in:
+ case p2m_ram_paging_in_start:
+ default:
+ entry->r = entry->w = entry->x = 0;
+ break;
+ case p2m_ram_rw:
+ entry->r = entry->w = entry->x = 1;
+ break;
+ case p2m_mmio_direct:
+ entry->r = entry->x = 1;
+ entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
+ entry->mfn);
+ break;
+ case p2m_ram_logdirty:
+ case p2m_ram_ro:
+ case p2m_ram_shared:
+ entry->r = entry->x = 1;
+ entry->w = 0;
+ break;
+ case p2m_grant_map_rw:
+ entry->r = entry->w = 1;
+ entry->x = 0;
+ break;
+ case p2m_grant_map_ro:
+ entry->r = 1;
+ entry->w = entry->x = 0;
+ break;
+ }
+
+
+ /* Then restrict with access permissions */
+ switch (access)
+ {
+ case p2m_access_n:
+ entry->r = entry->w = entry->x = 0;
+ break;
+ case p2m_access_r:
+ entry->w = entry->x = 0;
+ break;
+ case p2m_access_w:
+ entry->r = entry->x = 0;
+ break;
+ case p2m_access_x:
+ entry->r = entry->w = 0;
+ break;
+ case p2m_access_rx:
+ case p2m_access_rx2rw:
+ entry->w = 0;
+ break;
+ case p2m_access_wx:
+ entry->r = 0;
+ break;
+ case p2m_access_rw:
+ entry->x = 0;
+ break;
+ case p2m_access_rwx:
+ break;
+ }
+
+}
+
+#define GUEST_TABLE_MAP_FAILED 0
+#define GUEST_TABLE_NORMAL_PAGE 1
+#define GUEST_TABLE_SUPER_PAGE 2
+#define GUEST_TABLE_POD_PAGE 3
+
+/* Fill in middle levels of ept table */
+static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry)
+{
+ struct page_info *pg;
+
+ pg = p2m_alloc_ptp(p2m, 0);
+ if ( pg == NULL )
+ return 0;
+
+ ept_entry->epte = 0;
+ ept_entry->mfn = page_to_mfn(pg);
+ ept_entry->access = p2m->default_access;
+
+ ept_entry->r = ept_entry->w = ept_entry->x = 1;
+
+ return 1;
+}
+
+/* free ept sub tree behind an entry */
+void ept_free_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry, int level)
+{
+ /* End if the entry is a leaf entry. */
+ if ( level == 0 || !is_epte_present(ept_entry) ||
+ is_epte_superpage(ept_entry) )
+ return;
+
+ if ( level > 1 )
+ {
+ ept_entry_t *epte = map_domain_page(ept_entry->mfn);
+ for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+ ept_free_entry(p2m, epte + i, level - 1);
+ unmap_domain_page(epte);
+ }
+
+ p2m_free_ptp(p2m, mfn_to_page(ept_entry->mfn));
+}
+
+static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,
+ int level, int target)
+{
+ ept_entry_t new_ept, *table;
+ uint64_t trunk;
+ int rv = 1;
+
+ /* End if the entry is a leaf entry or reaches the target level. */
+ if ( level == 0 || level == target )
+ return rv;
+
+ ASSERT(is_epte_superpage(ept_entry));
+
+ if ( !ept_set_middle_entry(p2m, &new_ept) )
+ return 0;
+
+ table = map_domain_page(new_ept.mfn);
+ trunk = 1UL << ((level - 1) * EPT_TABLE_ORDER);
+
+ for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+ {
+ ept_entry_t *epte = table + i;
+
+ epte->epte = 0;
+ epte->emt = ept_entry->emt;
+ epte->ipat = ept_entry->ipat;
+ epte->sp = (level > 1) ? 1 : 0;
+ epte->access = ept_entry->access;
+ epte->sa_p2mt = ept_entry->sa_p2mt;
+ epte->mfn = ept_entry->mfn + i * trunk;
+ epte->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0;
+
+ ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);
+
+ if ( (level - 1) == target )
+ continue;
+
+ ASSERT(is_epte_superpage(epte));
+
+ if ( !(rv = ept_split_super_page(p2m, epte, level - 1, target)) )
+ break;
+ }
+
+ unmap_domain_page(table);
+
+ /* Even failed we should install the newly allocated ept page. */
+ *ept_entry = new_ept;
+
+ return rv;
+}
+
+/* Take the currently mapped table, find the corresponding gfn entry,
+ * and map the next table, if available. If the entry is empty
+ * and read_only is set,
+ * Return values:
+ * 0: Failed to map. Either read_only was set and the entry was
+ * empty, or allocating a new page failed.
+ * GUEST_TABLE_NORMAL_PAGE: next level mapped normally
+ * GUEST_TABLE_SUPER_PAGE:
+ * The next entry points to a superpage, and caller indicates
+ * that they are going to the superpage level, or are only doing
+ * a read.
+ * GUEST_TABLE_POD:
+ * The next entry is marked populate-on-demand.
+ */
+static int ept_next_level(struct p2m_domain *p2m, bool_t read_only,
+ ept_entry_t **table, unsigned long *gfn_remainder,
+ int next_level)
+{
+ unsigned long mfn;
+ ept_entry_t *ept_entry, e;
+ u32 shift, index;
+
+ shift = next_level * EPT_TABLE_ORDER;
+
+ index = *gfn_remainder >> shift;
+
+ /* index must be falling into the page */
+ ASSERT(index < EPT_PAGETABLE_ENTRIES);
+
+ ept_entry = (*table) + index;
+
+ /* ept_next_level() is called (sometimes) without a lock. Read
+ * the entry once, and act on the "cached" entry after that to
+ * avoid races. */
+ e = atomic_read_ept_entry(ept_entry);
+
+ if ( !is_epte_present(&e) )
+ {
+ if ( e.sa_p2mt == p2m_populate_on_demand )
+ return GUEST_TABLE_POD_PAGE;
+
+ if ( read_only )
+ return GUEST_TABLE_MAP_FAILED;
+
+ if ( !ept_set_middle_entry(p2m, ept_entry) )
+ return GUEST_TABLE_MAP_FAILED;
+ else
+ e = atomic_read_ept_entry(ept_entry); /* Refresh */
+ }
+
+ /* The only time sp would be set here is if we had hit a superpage */
+ if ( is_epte_superpage(&e) )
+ return GUEST_TABLE_SUPER_PAGE;
+
+ mfn = e.mfn;
+ unmap_domain_page(*table);
+ *table = map_domain_page(mfn);
+ *gfn_remainder &= (1UL << shift) - 1;
+ return GUEST_TABLE_NORMAL_PAGE;
+}
+
+/*
+ * ept_set_entry() computes 'need_modify_vtd_table' for itself,
+ * by observing whether any gfn->mfn translations are modified.
+ */
+static int
+ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
+ unsigned int order, p2m_type_t p2mt, p2m_access_t p2ma)
+{
+ ept_entry_t *table, *ept_entry = NULL;
+ unsigned long gfn_remainder = gfn;
+ unsigned long offset = 0;
+ u32 index;
+ int i, target = order / EPT_TABLE_ORDER;
+ int rv = 0;
+ int ret = 0;
+ bool_t direct_mmio = (p2mt == p2m_mmio_direct);
+ uint8_t ipat = 0;
+ int need_modify_vtd_table = 1;
+ int vtd_pte_present = 0;
+ int needs_sync = 1;
+ struct domain *d = p2m->domain;
+ ept_entry_t old_entry = { .epte = 0 };
+
+ /*
+ * the caller must make sure:
+ * 1. passing valid gfn and mfn at order boundary.
+ * 2. gfn not exceeding guest physical address width.
+ * 3. passing a valid order.
+ */
+ if ( ((gfn | mfn_x(mfn)) & ((1UL << order) - 1)) ||
+ ((u64)gfn >> ((ept_get_wl(d) + 1) * EPT_TABLE_ORDER)) ||
+ (order % EPT_TABLE_ORDER) )
+ return 0;
+
+ ASSERT((target == 2 && hvm_hap_has_1gb(d)) ||
+ (target == 1 && hvm_hap_has_2mb(d)) ||
+ (target == 0));
+
+ table = map_domain_page(ept_get_asr(d));
+
+ ASSERT(table != NULL);
+
+ for ( i = ept_get_wl(d); i > target; i-- )
+ {
+ ret = ept_next_level(p2m, 0, &table, &gfn_remainder, i);
+ if ( !ret )
+ goto out;
+ else if ( ret != GUEST_TABLE_NORMAL_PAGE )
+ break;
+ }
+
+ ASSERT(ret != GUEST_TABLE_POD_PAGE || i != target);
+
+ index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+ offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1);
+
+ ept_entry = table + index;
+
+ /* In case VT-d uses same page table, this flag is needed by VT-d */
+ vtd_pte_present = is_epte_present(ept_entry) ? 1 : 0;
+
+ /*
+ * If we're here with i > target, we must be at a leaf node, and
+ * we need to break up the superpage.
+ *
+ * If we're here with i == target and i > 0, we need to check to see
+ * if we're replacing a non-leaf entry (i.e., pointing to an N-1 table)
+     * with a leaf entry (a 1GiB or 2MiB page), and handle things appropriately.
+ */
+
+ if ( i == target )
+ {
+ /* We reached the target level. */
+ ept_entry_t new_entry = { .epte = 0 };
+
+ /* No need to flush if the old entry wasn't valid */
+ if ( !is_epte_present(ept_entry) )
+ needs_sync = 0;
+
+        /* If we're replacing a non-leaf entry with a leaf entry (1GiB or 2MiB),
+ * the intermediate tables will be freed below after the ept flush
+ *
+ * Read-then-write is OK because we hold the p2m lock. */
+ old_entry = *ept_entry;
+
+ if ( mfn_valid(mfn_x(mfn)) || direct_mmio || p2m_is_paged(p2mt) ||
+ (p2mt == p2m_ram_paging_in_start) )
+ {
+ /* Construct the new entry, and then write it once */
+ new_entry.emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat,
+ direct_mmio);
+
+ new_entry.ipat = ipat;
+ new_entry.sp = order ? 1 : 0;
+ new_entry.sa_p2mt = p2mt;
+ new_entry.access = p2ma;
+ new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
+
+ new_entry.mfn = mfn_x(mfn);
+
+ if ( old_entry.mfn == new_entry.mfn )
+ need_modify_vtd_table = 0;
+
+ ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
+ }
+
+ atomic_write_ept_entry(ept_entry, new_entry);
+ }
+ else
+ {
+ /* We need to split the original page. */
+ ept_entry_t split_ept_entry;
+ ept_entry_t new_entry = { .epte = 0 };
+
+ ASSERT(is_epte_superpage(ept_entry));
+
+ split_ept_entry = atomic_read_ept_entry(ept_entry);
+
+ if ( !ept_split_super_page(p2m, &split_ept_entry, i, target) )
+ {
+ ept_free_entry(p2m, &split_ept_entry, i);
+ goto out;
+ }
+
+ /* now install the newly split ept sub-tree */
+ /* NB: please make sure domian is paused and no in-fly VT-d DMA. */
+ atomic_write_ept_entry(ept_entry, split_ept_entry);
+
+ /* then move to the level we want to make real changes */
+ for ( ; i > target; i-- )
+ ept_next_level(p2m, 0, &table, &gfn_remainder, i);
+
+ ASSERT(i == target);
+
+ index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+ offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1);
+
+ ept_entry = table + index;
+
+ new_entry.emt = epte_get_entry_emt(d, gfn, mfn, &ipat, direct_mmio);
+ new_entry.ipat = ipat;
+ new_entry.sp = i ? 1 : 0;
+ new_entry.sa_p2mt = p2mt;
+ new_entry.access = p2ma;
+ new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
+
+ /* the caller should take care of the previous page */
+ new_entry.mfn = mfn_x(mfn);
+
+ /* Safe to read-then-write because we hold the p2m lock */
+ if ( ept_entry->mfn == new_entry.mfn )
+ need_modify_vtd_table = 0;
+
+ ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
+
+ atomic_write_ept_entry(ept_entry, new_entry);
+ }
+
+ /* Track the highest gfn for which we have ever had a valid mapping */
+ if ( mfn_valid(mfn_x(mfn)) &&
+ (gfn + (1UL << order) - 1 > p2m->max_mapped_pfn) )
+ p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
+
+ /* Success */
+ rv = 1;
+
+out:
+ unmap_domain_page(table);
+
+ if ( needs_sync )
+ ept_sync_domain(p2m->domain);
+
+    if ( rv && iommu_enabled && need_iommu(p2m->domain) && need_modify_vtd_table )
+ {
+ if ( iommu_hap_pt_share )
+ iommu_pte_flush(d, gfn, (u64*)ept_entry, order, vtd_pte_present);
+ else
+ {
+ if ( p2mt == p2m_ram_rw )
+ {
+ if ( order > 0 )
+ {
+ for ( i = 0; i < (1 << order); i++ )
+ iommu_map_page(
+                            p2m->domain, gfn - offset + i, mfn_x(mfn) - offset + i,
+ IOMMUF_readable | IOMMUF_writable);
+ }
+ else if ( !order )
+ iommu_map_page(
+                    p2m->domain, gfn, mfn_x(mfn), IOMMUF_readable | IOMMUF_writable);
+ }
+ else
+ {
+ if ( order > 0 )
+ {
+ for ( i = 0; i < (1 << order); i++ )
+ iommu_unmap_page(p2m->domain, gfn - offset + i);
+ }
+ else if ( !order )
+ iommu_unmap_page(p2m->domain, gfn);
+ }
+ }
+ }
+
+ /* Release the old intermediate tables, if any. This has to be the
+ last thing we do, after the ept_sync_domain() and removal
+ from the iommu tables, so as to avoid a potential
+ use-after-free. */
+ if ( is_epte_present(&old_entry) )
+ ept_free_entry(p2m, &old_entry, target);
+
+ return rv;
+}
+
+/* Read ept p2m entries */
+static mfn_t ept_get_entry(struct p2m_domain *p2m,
+ unsigned long gfn, p2m_type_t *t, p2m_access_t* a,
+ p2m_query_t q)
+{
+ struct domain *d = p2m->domain;
+ ept_entry_t *table = map_domain_page(ept_get_asr(d));
+ unsigned long gfn_remainder = gfn;
+ ept_entry_t *ept_entry;
+ u32 index;
+ int i;
+ int ret = 0;
+ mfn_t mfn = _mfn(INVALID_MFN);
+
+ *t = p2m_mmio_dm;
+ *a = p2m_access_n;
+
+ /* This pfn is higher than the highest the p2m map currently holds */
+ if ( gfn > p2m->max_mapped_pfn )
+ goto out;
+
+ /* Should check if gfn obeys GAW here. */
+
+ for ( i = ept_get_wl(d); i > 0; i-- )
+ {
+ retry:
+ ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
+ if ( !ret )
+ goto out;
+ else if ( ret == GUEST_TABLE_POD_PAGE )
+ {
+ if ( q == p2m_query )
+ {
+ *t = p2m_populate_on_demand;
+ goto out;
+ }
+
+ /* Populate this superpage */
+ ASSERT(i == 1);
+
+ index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
+ ept_entry = table + index;
+
+ if ( !ept_pod_check_and_populate(p2m, gfn,
+ ept_entry, 9, q) )
+ goto retry;
+ else
+ goto out;
+ }
+ else if ( ret == GUEST_TABLE_SUPER_PAGE )
+ break;
+ }
+
+ index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+ ept_entry = table + index;
+
+ if ( ept_entry->sa_p2mt == p2m_populate_on_demand )
+ {
+ if ( q == p2m_query )
+ {
+ *t = p2m_populate_on_demand;
+ goto out;
+ }
+
+ ASSERT(i == 0);
+
+ if ( ept_pod_check_and_populate(p2m, gfn,
+ ept_entry, 0, q) )
+ goto out;
+ }
+
+ /* Need to check for all-zeroes because typecode 0 is p2m_ram and an
+ * entirely empty entry shouldn't have RAM type. */
+ if ( ept_entry->epte != 0 && ept_entry->sa_p2mt != p2m_invalid )
+ {
+ *t = ept_entry->sa_p2mt;
+ *a = ept_entry->access;
+
+ mfn = _mfn(ept_entry->mfn);
+ if ( i )
+ {
+ /*
+ * We may meet super pages, and to split into 4k pages
+ * to emulate p2m table
+ */
+ unsigned long split_mfn = mfn_x(mfn) +
+ (gfn_remainder &
+ ((1 << (i * EPT_TABLE_ORDER)) - 1));
+ mfn = _mfn(split_mfn);
+ }
+ }
+
+out:
+ unmap_domain_page(table);
+ return mfn;
+}
+
+/* WARNING: Only caller doesn't care about PoD pages. So this function will
+ * always return 0 for PoD pages, not populate them. If that becomes necessary,
+ * pass a p2m_query_t type along to distinguish. */
+static ept_entry_t ept_get_entry_content(struct p2m_domain *p2m,
+ unsigned long gfn, int *level)
+{
+ ept_entry_t *table = map_domain_page(ept_get_asr(p2m->domain));
+ unsigned long gfn_remainder = gfn;
+ ept_entry_t *ept_entry;
+ ept_entry_t content = { .epte = 0 };
+ u32 index;
+ int i;
+ int ret=0;
+
+ /* This pfn is higher than the highest the p2m map currently holds */
+ if ( gfn > p2m->max_mapped_pfn )
+ goto out;
+
+ for ( i = ept_get_wl(p2m->domain); i > 0; i-- )
+ {
+ ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
+ if ( !ret || ret == GUEST_TABLE_POD_PAGE )
+ goto out;
+ else if ( ret == GUEST_TABLE_SUPER_PAGE )
+ break;
+ }
+
+ index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+ ept_entry = table + index;
+ content = *ept_entry;
+ *level = i;
+
+ out:
+ unmap_domain_page(table);
+ return content;
+}
+
+void ept_walk_table(struct domain *d, unsigned long gfn)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ ept_entry_t *table = map_domain_page(ept_get_asr(d));
+ unsigned long gfn_remainder = gfn;
+
+ int i;
+
+ gdprintk(XENLOG_ERR, "Walking EPT tables for domain %d gfn %lx\n",
+ d->domain_id, gfn);
+
+ /* This pfn is higher than the highest the p2m map currently holds */
+ if ( gfn > p2m->max_mapped_pfn )
+ {
+ gdprintk(XENLOG_ERR, " gfn exceeds max_mapped_pfn %lx\n",
+ p2m->max_mapped_pfn);
+ goto out;
+ }
+
+ for ( i = ept_get_wl(d); i >= 0; i-- )
+ {
+ ept_entry_t *ept_entry, *next;
+ u32 index;
+
+ /* Stolen from ept_next_level */
+ index = gfn_remainder >> (i*EPT_TABLE_ORDER);
+ ept_entry = table + index;
+
+ gdprintk(XENLOG_ERR, " epte %"PRIx64"\n", ept_entry->epte);
+
+ if ( (i == 0) || !is_epte_present(ept_entry) ||
+ is_epte_superpage(ept_entry) )
+ goto out;
+ else
+ {
+ gfn_remainder &= (1UL << (i*EPT_TABLE_ORDER)) - 1;
+
+ next = map_domain_page(ept_entry->mfn);
+
+ unmap_domain_page(table);
+
+ table = next;
+ }
+ }
+
+out:
+ unmap_domain_page(table);
+ return;
+}
+
+static mfn_t ept_get_entry_current(struct p2m_domain *p2m,
+                           unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
+ p2m_query_t q)
+{
+ return ept_get_entry(p2m, gfn, t, a, q);
+}
+
+/*
+ * To test if the new emt type is the same with old,
+ * return 1 to not to reset ept entry.
+ */
+static int need_modify_ept_entry(struct p2m_domain *p2m, unsigned long gfn,
+ mfn_t mfn, uint8_t o_ipat, uint8_t o_emt,
+ p2m_type_t p2mt)
+{
+ uint8_t ipat;
+ uint8_t emt;
+ bool_t direct_mmio = (p2mt == p2m_mmio_direct);
+
+ emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat, direct_mmio);
+
+ if ( (emt == o_emt) && (ipat == o_ipat) )
+ return 0;
+
+ return 1;
+}
+
+void ept_change_entry_emt_with_range(struct domain *d,
+ unsigned long start_gfn,
+ unsigned long end_gfn)
+{
+ unsigned long gfn;
+ ept_entry_t e;
+ mfn_t mfn;
+ int order = 0;
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+ p2m_lock(p2m);
+ for ( gfn = start_gfn; gfn <= end_gfn; gfn++ )
+ {
+ int level = 0;
+ uint64_t trunk = 0;
+
+ e = ept_get_entry_content(p2m, gfn, &level);
+ if ( !p2m_has_emt(e.sa_p2mt) )
+ continue;
+
+ order = 0;
+ mfn = _mfn(e.mfn);
+
+ if ( is_epte_superpage(&e) )
+ {
+ while ( level )
+ {
+ trunk = (1UL << (level * EPT_TABLE_ORDER)) - 1;
+ if ( !(gfn & trunk) && (gfn + trunk <= end_gfn) )
+ {
+ /* gfn assigned with 2M or 1G, and the end covers more than
+ * the super page areas.
+ * Set emt for super page.
+ */
+ order = level * EPT_TABLE_ORDER;
+ if ( need_modify_ept_entry(p2m, gfn, mfn,
+ e.ipat, e.emt, e.sa_p2mt) )
+                        ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt, e.access);
+ gfn += trunk;
+ break;
+ }
+ level--;
+ }
+ }
+ else /* gfn assigned with 4k */
+ {
+            if ( need_modify_ept_entry(p2m, gfn, mfn, e.ipat, e.emt, e.sa_p2mt) )
+ ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt, e.access);
+ }
+ }
+ p2m_unlock(p2m);
+}
+
+/*
+ * Walk the whole p2m table, changing any entries of the old type
+ * to the new type. This is used in hardware-assisted paging to
+ * quickly enable or diable log-dirty tracking
+ */
+static void ept_change_entry_type_page(mfn_t ept_page_mfn, int ept_page_level,
+ p2m_type_t ot, p2m_type_t nt)
+{
+ ept_entry_t e, *epte = map_domain_page(mfn_x(ept_page_mfn));
+
+ for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+ {
+ if ( !is_epte_present(epte + i) )
+ continue;
+
+ if ( (ept_page_level > 0) && !is_epte_superpage(epte + i) )
+ ept_change_entry_type_page(_mfn(epte[i].mfn),
+ ept_page_level - 1, ot, nt);
+ else
+ {
+ e = atomic_read_ept_entry(&epte[i]);
+ if ( e.sa_p2mt != ot )
+ continue;
+
+ e.sa_p2mt = nt;
+ ept_p2m_type_to_flags(&e, nt, e.access);
+ atomic_write_ept_entry(&epte[i], e);
+ }
+ }
+
+ unmap_domain_page(epte);
+}
+
+static void ept_change_entry_type_global(struct p2m_domain *p2m,
+ p2m_type_t ot, p2m_type_t nt)
+{
+ struct domain *d = p2m->domain;
+ if ( ept_get_asr(d) == 0 )
+ return;
+
+ BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+ BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
+
+ ept_change_entry_type_page(_mfn(ept_get_asr(d)), ept_get_wl(d), ot, nt);
+
+ ept_sync_domain(d);
+}
+
+void ept_p2m_init(struct domain *d)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ p2m->set_entry = ept_set_entry;
+ p2m->get_entry = ept_get_entry;
+ p2m->get_entry_current = ept_get_entry_current;
+ p2m->change_entry_type_global = ept_change_entry_type_global;
+}
+
+static void ept_dump_p2m_table(unsigned char key)
+{
+ struct domain *d;
+ ept_entry_t *table, *ept_entry;
+ mfn_t mfn;
+ int order;
+ int i;
+ int is_pod;
+ int ret = 0;
+ unsigned long index;
+ unsigned long gfn, gfn_remainder;
+ unsigned long record_counter = 0;
+ struct p2m_domain *p2m;
+
+ for_each_domain(d)
+ {
+ if ( !hap_enabled(d) )
+ continue;
+
+ p2m = p2m_get_hostp2m(d);
+ printk("\ndomain%d EPT p2m table: \n", d->domain_id);
+
+ for ( gfn = 0; gfn <= p2m->max_mapped_pfn; gfn += (1 << order) )
+ {
+ gfn_remainder = gfn;
+ mfn = _mfn(INVALID_MFN);
+ table = map_domain_page(ept_get_asr(d));
+
+ for ( i = ept_get_wl(d); i > 0; i-- )
+ {
+ ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
+ if ( ret != GUEST_TABLE_NORMAL_PAGE )
+ break;
+ }
+
+ order = i * EPT_TABLE_ORDER;
+
+ if ( ret == GUEST_TABLE_MAP_FAILED )
+ goto out;
+
+ index = gfn_remainder >> order;
+ ept_entry = table + index;
+ if ( ept_entry->sa_p2mt != p2m_invalid )
+ {
+ ( ept_entry->sa_p2mt == p2m_populate_on_demand ) ?
+ ( mfn = _mfn(INVALID_MFN), is_pod = 1 ) :
+ ( mfn = _mfn(ept_entry->mfn), is_pod = 0 );
+
+ printk("gfn: %-16lx mfn: %-16lx order: %2d is_pod: %d\n",
+ gfn, mfn_x(mfn), order, is_pod);
+
+ if ( !(record_counter++ % 100) )
+ process_pending_softirqs();
+ }
+out:
+ unmap_domain_page(table);
+ }
+ }
+}
+
+static struct keyhandler ept_p2m_table = {
+ .diagnostic = 0,
+ .u.fn = ept_dump_p2m_table,
+ .desc = "dump ept p2m table"
+};
+
+void setup_ept_dump(void)
+{
+ register_keyhandler('D', &ept_p2m_table);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/p2m-pod.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/p2m-pod.c Fri May 06 11:15:35 2011 +0100
@@ -0,0 +1,1151 @@
+/******************************************************************************
+ * arch/x86/mm/p2m-pod.c
+ *
+ * Populate-on-demand p2m entries.
+ *
+ * Copyright (c) 2009-2011 Citrix Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <asm/domain.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */
+#include <xen/iommu.h>
+#include <asm/mem_event.h>
+#include <public/mem_event.h>
+#include <asm/mem_sharing.h>
+#include <xen/event.h>
+#include <asm/hvm/nestedhvm.h>
+#include <asm/hvm/svm/amd-iommu-proto.h>
+
+/* Printouts */
+#define P2M_PRINTK(_f, _a...) \
+ debugtrace_printk("p2m: %s(): " _f, __func__, ##_a)
+#define P2M_ERROR(_f, _a...) \
+ printk("pg error: %s(): " _f, __func__, ##_a)
+#if P2M_DEBUGGING
+#define P2M_DEBUG(_f, _a...) \
+ debugtrace_printk("p2mdebug: %s(): " _f, __func__, ##_a)
+#else
+#define P2M_DEBUG(_f, _a...) do { (void)(_f); } while(0)
+#endif
+
+
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_to_page
+#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+#if P2M_AUDIT
+extern void audit_p2m(struct p2m_domain *p2m, int strict_m2p);
+#else
+# define audit_p2m(_p2m, _m2p) do { (void)(_p2m),(_m2p); } while (0)
+#endif /* P2M_AUDIT */
+
+#define SUPERPAGE_PAGES (1UL << 9)
+#define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
+
+/*
+ * Populate-on-demand functionality
+ */
+
+static int
+p2m_pod_cache_add(struct p2m_domain *p2m,
+ struct page_info *page,
+ unsigned long order)
+{
+ int i;
+ struct page_info *p;
+ struct domain *d = p2m->domain;
+
+#ifndef NDEBUG
+ mfn_t mfn;
+
+ mfn = page_to_mfn(page);
+
+ /* Check to make sure this is a contiguous region */
+ if( mfn_x(mfn) & ((1 << order) - 1) )
+ {
+ printk("%s: mfn %lx not aligned order %lu! (mask %lx)\n",
+ __func__, mfn_x(mfn), order, ((1UL << order) - 1));
+ return -1;
+ }
+
+ for(i=0; i < 1 << order ; i++) {
+ struct domain * od;
+
+ p = mfn_to_page(_mfn(mfn_x(mfn) + i));
+ od = page_get_owner(p);
+ if(od != d)
+ {
+ printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
+ __func__, mfn_x(mfn), d->domain_id,
+ od?od->domain_id:-1);
+ return -1;
+ }
+ }
+#endif
+
+ ASSERT(p2m_locked_by_me(p2m));
+
+ /*
+ * Pages from domain_alloc and returned by the balloon driver aren't
+ * guaranteed to be zero; but by reclaiming zero pages, we implicitly
+ * promise to provide zero pages. So we scrub pages before using.
+ */
+ for ( i = 0; i < (1 << order); i++ )
+ {
+ char *b = map_domain_page(mfn_x(page_to_mfn(page)) + i);
+ clear_page(b);
+ unmap_domain_page(b);
+ }
+
+ spin_lock(&d->page_alloc_lock);
+
+ /* First, take all pages off the domain list */
+ for(i=0; i < 1 << order ; i++)
+ {
+ p = page + i;
+ page_list_del(p, &d->page_list);
+ }
+
+ /* Then add the first one to the appropriate populate-on-demand list */
+ switch(order)
+ {
+ case 9:
+ page_list_add_tail(page, &p2m->pod.super); /* lock: page_alloc */
+ p2m->pod.count += 1 << order;
+ break;
+ case 0:
+ page_list_add_tail(page, &p2m->pod.single); /* lock: page_alloc */
+ p2m->pod.count += 1;
+ break;
+ default:
+ BUG();
+ }
+
+ /* Ensure that the PoD cache has never been emptied.
+ * This may cause "zombie domains" since the page will never be freed. */
+ BUG_ON( d->arch.relmem != RELMEM_not_started );
+
+ spin_unlock(&d->page_alloc_lock);
+
+ return 0;
+}
+
+/* Get a page of size order from the populate-on-demand cache. Will break
+ * down 2-meg pages into singleton pages automatically. Returns null if
+ * a superpage is requested and no superpages are available. Must be called
+ * with the d->page_lock held. */
+static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
+ unsigned long order)
+{
+ struct page_info *p = NULL;
+ int i;
+
+ if ( order == 9 && page_list_empty(&p2m->pod.super) )
+ {
+ return NULL;
+ }
+ else if ( order == 0 && page_list_empty(&p2m->pod.single) )
+ {
+ unsigned long mfn;
+ struct page_info *q;
+
+ BUG_ON( page_list_empty(&p2m->pod.super) );
+
+ /* Break up a superpage to make single pages. NB count doesn't
+ * need to be adjusted. */
+ p = page_list_remove_head(&p2m->pod.super);
+ mfn = mfn_x(page_to_mfn(p));
+
+ for ( i=0; i<SUPERPAGE_PAGES; i++ )
+ {
+ q = mfn_to_page(_mfn(mfn+i));
+ page_list_add_tail(q, &p2m->pod.single);
+ }
+ }
+
+ switch ( order )
+ {
+ case 9:
+ BUG_ON( page_list_empty(&p2m->pod.super) );
+ p = page_list_remove_head(&p2m->pod.super);
+ p2m->pod.count -= 1 << order; /* Lock: page_alloc */
+ break;
+ case 0:
+ BUG_ON( page_list_empty(&p2m->pod.single) );
+ p = page_list_remove_head(&p2m->pod.single);
+ p2m->pod.count -= 1;
+ break;
+ default:
+ BUG();
+ }
+
+ /* Put the pages back on the domain page_list */
+ for ( i = 0 ; i < (1 << order); i++ )
+ {
+ BUG_ON(page_get_owner(p + i) != p2m->domain);
+ page_list_add_tail(p + i, &p2m->domain->page_list);
+ }
+
+ return p;
+}
+
+/* Set the size of the cache, allocating or freeing as necessary. */
+static int
+p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible)
+{
+ struct domain *d = p2m->domain;
+ int ret = 0;
+
+ /* Increasing the target */
+ while ( pod_target > p2m->pod.count )
+ {
+ struct page_info * page;
+ int order;
+
+ if ( (pod_target - p2m->pod.count) >= SUPERPAGE_PAGES )
+ order = 9;
+ else
+ order = 0;
+ retry:
+ page = alloc_domheap_pages(d, order, 0);
+ if ( unlikely(page == NULL) )
+ {
+ if ( order == 9 )
+ {
+ /* If we can't allocate a superpage, try singleton pages */
+ order = 0;
+ goto retry;
+ }
+
+            printk("%s: Unable to allocate domheap page for pod cache. target %lu cachesize %d\n",
+ __func__, pod_target, p2m->pod.count);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ p2m_pod_cache_add(p2m, page, order);
+
+ if ( hypercall_preempt_check() && preemptible )
+ {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+
+ /* Decreasing the target */
+ /* We hold the p2m lock here, so we don't need to worry about
+ * cache disappearing under our feet. */
+ while ( pod_target < p2m->pod.count )
+ {
+ struct page_info * page;
+ int order, i;
+
+ /* Grab the lock before checking that pod.super is empty, or the last
+ * entries may disappear before we grab the lock. */
+ spin_lock(&d->page_alloc_lock);
+
+ if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
+ && !page_list_empty(&p2m->pod.super) )
+ order = 9;
+ else
+ order = 0;
+
+ page = p2m_pod_cache_get(p2m, order);
+
+ ASSERT(page != NULL);
+
+ spin_unlock(&d->page_alloc_lock);
+
+ /* Then free them */
+ for ( i = 0 ; i < (1 << order) ; i++ )
+ {
+ /* Copied from common/memory.c:guest_remove_page() */
+ if ( unlikely(!get_page(page+i, d)) )
+ {
+                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
+ ret = -EINVAL;
+ goto out;
+ }
+
+            if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
+ put_page_and_type(page+i);
+
+ if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
+ put_page(page+i);
+
+ put_page(page+i);
+
+ if ( hypercall_preempt_check() && preemptible )
+ {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * The "right behavior" here requires some careful thought. First, some
+ * definitions:
+ * + M: static_max
+ * + B: number of pages the balloon driver has ballooned down to.
+ * + P: Number of populated pages.
+ * + T: Old target
+ * + T': New target
+ *
+ * The following equations should hold:
+ * 0 <= P <= T <= B <= M
+ * d->arch.p2m->pod.entry_count == B - P
+ * d->tot_pages == P + d->arch.p2m->pod.count
+ *
+ * Now we have the following potential cases to cover:
+ * B <T': Set the PoD cache size equal to the number of outstanding PoD
+ * entries. The balloon driver will deflate the balloon to give back
+ * the remainder of the ram to the guest OS.
+ * T <T'<B : Increase PoD cache size.
+ * T'<T<=B : Here we have a choice. We can decrease the size of the cache,
+ * get the memory right away. However, that means every time we
+ * reduce the memory target we risk the guest attempting to populate the
+ * memory before the balloon driver has reached its new target. Safer to
+ * never reduce the cache size here, but only when the balloon driver frees
+ * PoD ranges.
+ *
+ * If there are many zero pages, we could reach the target also by doing
+ * zero sweeps and marking the ranges PoD; but the balloon driver will have
+ * to free this memory eventually anyway, so we don't actually gain that much
+ * by doing so.
+ *
+ * NB that the equation (B<T') may require adjustment to the cache
+ * size as PoD pages are freed as well; i.e., freeing a PoD-backed
+ * entry when pod.entry_count == pod.count requires us to reduce both
+ * pod.entry_count and pod.count.
+ */
+int
+p2m_pod_set_mem_target(struct domain *d, unsigned long target)
+{
+ unsigned pod_target;
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ int ret = 0;
+ unsigned long populated;
+
+ p2m_lock(p2m);
+
+ /* P == B: Nothing to do. */
+ if ( p2m->pod.entry_count == 0 )
+ goto out;
+
+ /* Don't do anything if the domain is being torn down */
+ if ( d->is_dying )
+ goto out;
+
+ /* T' < B: Don't reduce the cache size; let the balloon driver
+ * take care of it. */
+ if ( target < d->tot_pages )
+ goto out;
+
+ populated = d->tot_pages - p2m->pod.count;
+
+ pod_target = target - populated;
+
+ /* B < T': Set the cache size equal to # of outstanding entries,
+ * let the balloon driver fill in the rest. */
+ if ( pod_target > p2m->pod.entry_count )
+ pod_target = p2m->pod.entry_count;
+
+ ASSERT( pod_target >= p2m->pod.count );
+
+ ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
+
+out:
+ p2m_unlock(p2m);
+
+ return ret;
+}
+
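
To make the target arithmetic above concrete, here is a minimal stand-alone C sketch (not part of the patch) of the clamp that p2m_pod_set_mem_target() applies, using the M/B/P/T quantities from the preceding comment. The struct and helper names are hypothetical and the numbers are illustrative.

    /* Hypothetical, simplified model of the PoD cache-target clamp. */
    #include <stdio.h>

    struct pod_state {
        unsigned long tot_pages;   /* P + pod.count */
        unsigned long entry_count; /* B - P: outstanding PoD entries */
        unsigned long count;       /* pages currently in the PoD cache */
    };

    /* Returns the new cache size for a requested target T', or the current
     * size when the balloon driver should handle the change instead. */
    static unsigned long new_cache_target(const struct pod_state *s,
                                          unsigned long target)
    {
        unsigned long populated, pod_target;

        if ( s->entry_count == 0 )       /* P == B: nothing to do */
            return s->count;
        if ( target < s->tot_pages )     /* T' < B: leave it to the balloon */
            return s->count;

        populated = s->tot_pages - s->count;   /* P */
        pod_target = target - populated;       /* T' - P */
        if ( pod_target > s->entry_count )     /* B < T': clamp to B - P */
            pod_target = s->entry_count;
        return pod_target;
    }

    int main(void)
    {
        struct pod_state s = { .tot_pages = 1000, .entry_count = 300, .count = 200 };
        printf("new cache target = %lu\n", new_cache_target(&s, 1200)); /* 300 */
        return 0;
    }
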
+void
+p2m_pod_empty_cache(struct domain *d)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ struct page_info *page;
+
+ /* After this barrier no new PoD activities can happen. */
+ BUG_ON(!d->is_dying);
+ spin_barrier(&p2m->lock);
+
+ spin_lock(&d->page_alloc_lock);
+
+ while ( (page = page_list_remove_head(&p2m->pod.super)) )
+ {
+ int i;
+
+ for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ )
+ {
+ BUG_ON(page_get_owner(page + i) != d);
+ page_list_add_tail(page + i, &d->page_list);
+ }
+
+ p2m->pod.count -= SUPERPAGE_PAGES;
+ }
+
+ while ( (page = page_list_remove_head(&p2m->pod.single)) )
+ {
+ BUG_ON(page_get_owner(page) != d);
+ page_list_add_tail(page, &d->page_list);
+
+ p2m->pod.count -= 1;
+ }
+
+ BUG_ON(p2m->pod.count != 0);
+
+ spin_unlock(&d->page_alloc_lock);
+}
+
+int
+p2m_pod_offline_or_broken_hit(struct page_info *p)
+{
+ struct domain *d;
+ struct p2m_domain *p2m;
+ struct page_info *q, *tmp;
+ unsigned long mfn, bmfn;
+
+ if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
+ return 0;
+
+ spin_lock(&d->page_alloc_lock);
+ bmfn = mfn_x(page_to_mfn(p));
+ page_list_for_each_safe(q, tmp, &p2m->pod.super)
+ {
+ mfn = mfn_x(page_to_mfn(q));
+ if ( (bmfn >= mfn) && ((bmfn - mfn) < SUPERPAGE_PAGES) )
+ {
+ unsigned long i;
+ page_list_del(q, &p2m->pod.super);
+ for ( i = 0; i < SUPERPAGE_PAGES; i++)
+ {
+ q = mfn_to_page(_mfn(mfn + i));
+ page_list_add_tail(q, &p2m->pod.single);
+ }
+ page_list_del(p, &p2m->pod.single);
+ p2m->pod.count--;
+ goto pod_hit;
+ }
+ }
+
+ page_list_for_each_safe(q, tmp, &p2m->pod.single)
+ {
+ mfn = mfn_x(page_to_mfn(q));
+ if ( mfn == bmfn )
+ {
+ page_list_del(p, &p2m->pod.single);
+ p2m->pod.count--;
+ goto pod_hit;
+ }
+ }
+
+ spin_unlock(&d->page_alloc_lock);
+ return 0;
+
+pod_hit:
+ page_list_add_tail(p, &d->arch.relmem_list);
+ spin_unlock(&d->page_alloc_lock);
+ return 1;
+}
+
+void
+p2m_pod_offline_or_broken_replace(struct page_info *p)
+{
+ struct domain *d;
+ struct p2m_domain *p2m;
+
+ if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
+ return;
+
+ free_domheap_page(p);
+
+ p = alloc_domheap_page(d, 0);
+ if ( unlikely(!p) )
+ return;
+
+ p2m_lock(p2m);
+ p2m_pod_cache_add(p2m, p, 0);
+ p2m_unlock(p2m);
+ return;
+}
+
+/* This function is needed for two reasons:
+ * + To properly handle clearing of PoD entries
+ * + To "steal back" memory being freed for the PoD cache, rather than
+ * releasing it.
+ *
+ * Once both of these functions have been completed, we can return and
+ * allow decrease_reservation() to handle everything else.
+ */
+int
+p2m_pod_decrease_reservation(struct domain *d,
+ xen_pfn_t gpfn,
+ unsigned int order)
+{
+ int ret=0;
+ int i;
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+ int steal_for_cache = 0;
+ int pod = 0, nonpod = 0, ram = 0;
+
+
+ /* If we don't have any outstanding PoD entries, let things take their
+ * course */
+ if ( p2m->pod.entry_count == 0 )
+ goto out;
+
+ /* Figure out if we need to steal some freed memory for our cache */
+ steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );
+
+ p2m_lock(p2m);
+ audit_p2m(p2m, 1);
+
+ if ( unlikely(d->is_dying) )
+ goto out_unlock;
+
+ /* See what's in here. */
+ /* FIXME: Add contiguous; query for PSE entries? */
+ for ( i=0; i<(1<<order); i++)
+ {
+ p2m_type_t t;
+
+ gfn_to_mfn_query(p2m, gpfn + i, &t);
+
+ if ( t == p2m_populate_on_demand )
+ pod++;
+ else
+ {
+ nonpod++;
+ if ( p2m_is_ram(t) )
+ ram++;
+ }
+ }
+
+    /* No populate-on-demand?  Don't need to steal anything?  Then we're done!*/
+ if(!pod && !steal_for_cache)
+ goto out_unlock;
+
+ if ( !nonpod )
+ {
+ /* All PoD: Mark the whole region invalid and tell caller
+ * we're done. */
+        set_p2m_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid, p2m->default_access);
+ p2m->pod.entry_count-=(1<<order); /* Lock: p2m */
+ BUG_ON(p2m->pod.entry_count < 0);
+ ret = 1;
+ goto out_entry_check;
+ }
+
+ /* FIXME: Steal contig 2-meg regions for cache */
+
+ /* Process as long as:
+ * + There are PoD entries to handle, or
+ * + There is ram left, and we want to steal it
+ */
+ for ( i=0;
+ i<(1<<order) && (pod>0 || (steal_for_cache && ram > 0));
+ i++)
+ {
+ mfn_t mfn;
+ p2m_type_t t;
+
+ mfn = gfn_to_mfn_query(p2m, gpfn + i, &t);
+ if ( t == p2m_populate_on_demand )
+ {
+            set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
+ p2m->pod.entry_count--; /* Lock: p2m */
+ BUG_ON(p2m->pod.entry_count < 0);
+ pod--;
+ }
+ else if ( steal_for_cache && p2m_is_ram(t) )
+ {
+ struct page_info *page;
+
+ ASSERT(mfn_valid(mfn));
+
+ page = mfn_to_page(mfn);
+
+            set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
+ set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY);
+
+ p2m_pod_cache_add(p2m, page, 0);
+
+ steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );
+
+ nonpod--;
+ ram--;
+ }
+ }
+
+ /* If there are no more non-PoD entries, tell decrease_reservation() that
+ * there's nothing left to do. */
+ if ( nonpod == 0 )
+ ret = 1;
+
+out_entry_check:
+ /* If we've reduced our "liabilities" beyond our "assets", free some */
+ if ( p2m->pod.entry_count < p2m->pod.count )
+ {
+        p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/);
+ }
+
+out_unlock:
+ audit_p2m(p2m, 1);
+ p2m_unlock(p2m);
+
+out:
+ return ret;
+}
+
+void
+p2m_pod_dump_data(struct p2m_domain *p2m)
+{
+ printk(" PoD entries=%d cachesize=%d\n",
+ p2m->pod.entry_count, p2m->pod.count);
+}
+
+
+/* Search for all-zero superpages to be reclaimed as superpages for the
+ * PoD cache. Must be called w/ p2m lock held, page_alloc lock not held. */
+static int
+p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
+{
+ mfn_t mfn, mfn0 = _mfn(INVALID_MFN);
+ p2m_type_t type, type0 = 0;
+ unsigned long * map = NULL;
+ int ret=0, reset = 0;
+ int i, j;
+ int max_ref = 1;
+ struct domain *d = p2m->domain;
+
+ if ( !superpage_aligned(gfn) )
+ goto out;
+
+ /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
+ if ( paging_mode_shadow(d) )
+ max_ref++;
+
+ /* Look up the mfns, checking to make sure they're the same mfn
+ * and aligned, and mapping them. */
+ for ( i=0; i<SUPERPAGE_PAGES; i++ )
+ {
+
+ mfn = gfn_to_mfn_query(p2m, gfn + i, &type);
+
+ if ( i == 0 )
+ {
+ mfn0 = mfn;
+ type0 = type;
+ }
+
+ /* Conditions that must be met for superpage-superpage:
+ * + All gfns are ram types
+ * + All gfns have the same type
+ * + All of the mfns are allocated to a domain
+ * + None of the mfns are used as pagetables, or allocated via xenheap
+ * + The first mfn is 2-meg aligned
+ * + All the other mfns are in sequence
+ * Adding for good measure:
+ * + None of the mfns are likely to be mapped elsewhere (refcount
+ * 2 or less for shadow, 1 for hap)
+ */
+ if ( !p2m_is_ram(type)
+ || type != type0
+ || ( (mfn_to_page(mfn)->count_info & PGC_allocated) == 0 )
+             || ( (mfn_to_page(mfn)->count_info & (PGC_page_table|PGC_xen_heap)) != 0 )
+ || ( (mfn_to_page(mfn)->count_info & PGC_xen_heap ) != 0 )
+ || ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > max_ref )
+ || !( ( i == 0 && superpage_aligned(mfn_x(mfn0)) )
+ || ( i != 0 && mfn_x(mfn) == (mfn_x(mfn0) + i) ) ) )
+ goto out;
+ }
+
+ /* Now, do a quick check to see if it may be zero before unmapping. */
+ for ( i=0; i<SUPERPAGE_PAGES; i++ )
+ {
+ /* Quick zero-check */
+ map = map_domain_page(mfn_x(mfn0) + i);
+
+ for ( j=0; j<16; j++ )
+ if( *(map+j) != 0 )
+ break;
+
+ unmap_domain_page(map);
+
+ if ( j < 16 )
+ goto out;
+
+ }
+
+ /* Try to remove the page, restoring old mapping if it fails. */
+ set_p2m_entry(p2m, gfn,
+ _mfn(POPULATE_ON_DEMAND_MFN), 9,
+ p2m_populate_on_demand, p2m->default_access);
+
+    /* Make sure none of the MFNs are used elsewhere... for example, mapped
+ * via the grant table interface, or by qemu. Allow one refcount for
+ * being allocated to the domain. */
+ for ( i=0; i < SUPERPAGE_PAGES; i++ )
+ {
+ mfn = _mfn(mfn_x(mfn0) + i);
+ if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 )
+ {
+ reset = 1;
+ goto out_reset;
+ }
+ }
+
+ /* Finally, do a full zero-check */
+ for ( i=0; i < SUPERPAGE_PAGES; i++ )
+ {
+ map = map_domain_page(mfn_x(mfn0) + i);
+
+ for ( j=0; j<PAGE_SIZE/sizeof(*map); j++ )
+ if( *(map+j) != 0 )
+ {
+ reset = 1;
+ break;
+ }
+
+ unmap_domain_page(map);
+
+ if ( reset )
+ goto out_reset;
+ }
+
+ if ( tb_init_done )
+ {
+ struct {
+ u64 gfn, mfn;
+ int d:16,order:16;
+ } t;
+
+ t.gfn = gfn;
+ t.mfn = mfn_x(mfn);
+ t.d = d->domain_id;
+ t.order = 9;
+
+ __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
+ }
+
+ /* Finally! We've passed all the checks, and can add the mfn superpage
+ * back on the PoD cache, and account for the new p2m PoD entries */
+ p2m_pod_cache_add(p2m, mfn_to_page(mfn0), 9);
+ p2m->pod.entry_count += SUPERPAGE_PAGES;
+
+out_reset:
+ if ( reset )
+ set_p2m_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access);
+
+out:
+ return ret;
+}
+
+static void
+p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count)
+{
+ mfn_t mfns[count];
+ p2m_type_t types[count];
+ unsigned long * map[count];
+ struct domain *d = p2m->domain;
+
+ int i, j;
+ int max_ref = 1;
+
+ /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
+ if ( paging_mode_shadow(d) )
+ max_ref++;
+
+ /* First, get the gfn list, translate to mfns, and map the pages. */
+ for ( i=0; i<count; i++ )
+ {
+ mfns[i] = gfn_to_mfn_query(p2m, gfns[i], types + i);
+        /* If this is ram, and not a pagetable or from the xen heap, and probably not mapped
+ elsewhere, map it; otherwise, skip. */
+ if ( p2m_is_ram(types[i])
+ && ( (mfn_to_page(mfns[i])->count_info & PGC_allocated) != 0 )
+             && ( (mfn_to_page(mfns[i])->count_info & (PGC_page_table|PGC_xen_heap)) == 0 )
+             && ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) <= max_ref ) )
+ map[i] = map_domain_page(mfn_x(mfns[i]));
+ else
+ map[i] = NULL;
+ }
+
+ /* Then, go through and check for zeroed pages, removing write permission
+ * for those with zeroes. */
+ for ( i=0; i<count; i++ )
+ {
+ if(!map[i])
+ continue;
+
+ /* Quick zero-check */
+ for ( j=0; j<16; j++ )
+ if( *(map[i]+j) != 0 )
+ break;
+
+ if ( j < 16 )
+ {
+ unmap_domain_page(map[i]);
+ map[i] = NULL;
+ continue;
+ }
+
+ /* Try to remove the page, restoring old mapping if it fails. */
+ set_p2m_entry(p2m, gfns[i],
+ _mfn(POPULATE_ON_DEMAND_MFN), 0,
+ p2m_populate_on_demand, p2m->default_access);
+
+ /* See if the page was successfully unmapped. (Allow one refcount
+ * for being allocated to a domain.) */
+ if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 )
+ {
+ unmap_domain_page(map[i]);
+ map[i] = NULL;
+
+            set_p2m_entry(p2m, gfns[i], mfns[i], 0, types[i], p2m->default_access);
+
+ continue;
+ }
+ }
+
+ /* Now check each page for real */
+ for ( i=0; i < count; i++ )
+ {
+ if(!map[i])
+ continue;
+
+ for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
+ if( *(map[i]+j) != 0 )
+ break;
+
+ unmap_domain_page(map[i]);
+
+ /* See comment in p2m_pod_zero_check_superpage() re gnttab
+ * check timing. */
+ if ( j < PAGE_SIZE/sizeof(*map[i]) )
+ {
+            set_p2m_entry(p2m, gfns[i], mfns[i], 0, types[i], p2m->default_access);
+ }
+ else
+ {
+ if ( tb_init_done )
+ {
+ struct {
+ u64 gfn, mfn;
+ int d:16,order:16;
+ } t;
+
+ t.gfn = gfns[i];
+ t.mfn = mfn_x(mfns[i]);
+ t.d = d->domain_id;
+ t.order = 0;
+
+ __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
+ }
+
+ /* Add to cache, and account for the new p2m PoD entry */
+ p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), 0);
+ p2m->pod.entry_count++;
+ }
+ }
+
+}
+
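
Both zero-check routines above use the same two-stage test: scan the first 16 words of a candidate page as a cheap filter, and only do the full-page scan for pages that pass it. A stand-alone sketch of that pattern (the 4096-byte page size is an assumption here, not something the patch defines):

    #include <stdbool.h>
    #include <stddef.h>

    #define PAGE_WORDS (4096 / sizeof(unsigned long))

    static bool page_is_zero(const unsigned long *map)
    {
        size_t j;

        /* Quick check: the first 16 words reject most non-zero pages cheaply. */
        for ( j = 0; j < 16; j++ )
            if ( map[j] != 0 )
                return false;

        /* Full scan only for pages that passed the quick filter. */
        for ( j = 0; j < PAGE_WORDS; j++ )
            if ( map[j] != 0 )
                return false;

        return true;
    }

    int main(void)
    {
        static unsigned long page[PAGE_WORDS];  /* zero-initialised */
        return page_is_zero(page) ? 0 : 1;
    }
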
+#define POD_SWEEP_LIMIT 1024
+static void
+p2m_pod_emergency_sweep_super(struct p2m_domain *p2m)
+{
+ unsigned long i, start, limit;
+
+ if ( p2m->pod.reclaim_super == 0 )
+ {
+ p2m->pod.reclaim_super = (p2m->pod.max_guest>>9)<<9;
+ p2m->pod.reclaim_super -= SUPERPAGE_PAGES;
+ }
+
+ start = p2m->pod.reclaim_super;
+ limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
+
+ for ( i=p2m->pod.reclaim_super ; i > 0 ; i -= SUPERPAGE_PAGES )
+ {
+ p2m_pod_zero_check_superpage(p2m, i);
+ /* Stop if we're past our limit and we have found *something*.
+ *
+ * NB that this is a zero-sum game; we're increasing our cache size
+ * by increasing our 'debt'. Since we hold the p2m lock,
+ * (entry_count - count) must remain the same. */
+ if ( !page_list_empty(&p2m->pod.super) && i < limit )
+ break;
+ }
+
+ p2m->pod.reclaim_super = i ? i - SUPERPAGE_PAGES : 0;
+}
+
+#define POD_SWEEP_STRIDE 16
+static void
+p2m_pod_emergency_sweep(struct p2m_domain *p2m)
+{
+ unsigned long gfns[POD_SWEEP_STRIDE];
+ unsigned long i, j=0, start, limit;
+ p2m_type_t t;
+
+
+ if ( p2m->pod.reclaim_single == 0 )
+ p2m->pod.reclaim_single = p2m->pod.max_guest;
+
+ start = p2m->pod.reclaim_single;
+ limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
+
+ /* FIXME: Figure out how to avoid superpages */
+ for ( i=p2m->pod.reclaim_single; i > 0 ; i-- )
+ {
+ gfn_to_mfn_query(p2m, i, &t );
+ if ( p2m_is_ram(t) )
+ {
+ gfns[j] = i;
+ j++;
+ BUG_ON(j > POD_SWEEP_STRIDE);
+ if ( j == POD_SWEEP_STRIDE )
+ {
+ p2m_pod_zero_check(p2m, gfns, j);
+ j = 0;
+ }
+ }
+ /* Stop if we're past our limit and we have found *something*.
+ *
+ * NB that this is a zero-sum game; we're increasing our cache size
+ * by re-increasing our 'debt'. Since we hold the p2m lock,
+ * (entry_count - count) must remain the same. */
+ if ( p2m->pod.count > 0 && i < limit )
+ break;
+ }
+
+ if ( j )
+ p2m_pod_zero_check(p2m, gfns, j);
+
+ p2m->pod.reclaim_single = i ? i - 1 : i;
+
+}
+
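
The sweeps above keep a per-p2m cursor and a per-call budget (POD_SWEEP_LIMIT), stop early only once something has been reclaimed, and remember where to resume next time. A stand-alone sketch of that control flow, with a hypothetical reclaim() callback standing in for the zero checks:

    #include <stdbool.h>

    #define SWEEP_LIMIT 1024

    struct sweep {
        unsigned long cursor;   /* next gfn to look at; 0 means restart */
        unsigned long max_gfn;  /* highest gfn worth scanning */
    };

    static void sweep(struct sweep *s, bool (*reclaim)(unsigned long gfn))
    {
        unsigned long i, start, limit;
        bool found = false;

        if ( s->cursor == 0 )
            s->cursor = s->max_gfn;

        start = s->cursor;
        limit = (start > SWEEP_LIMIT) ? (start - SWEEP_LIMIT) : 0;

        for ( i = start; i > 0; i-- )
        {
            if ( reclaim(i) )
                found = true;
            /* Stop once we're past the per-call budget and found something. */
            if ( found && i < limit )
                break;
        }

        s->cursor = i ? i - 1 : i;   /* resume below where we stopped */
    }

    static bool dummy_reclaim(unsigned long gfn) { return (gfn % 512) == 0; }

    int main(void)
    {
        struct sweep s = { .cursor = 0, .max_gfn = 1UL << 20 };
        sweep(&s, dummy_reclaim);
        return 0;
    }
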
+int
+p2m_pod_demand_populate(struct p2m_domain *p2m, unsigned long gfn,
+ unsigned int order,
+ p2m_query_t q)
+{
+ struct domain *d = p2m->domain;
+ struct page_info *p = NULL; /* Compiler warnings */
+ unsigned long gfn_aligned;
+ mfn_t mfn;
+ int i;
+
+ ASSERT(p2m_locked_by_me(p2m));
+
+ /* This check is done with the p2m lock held. This will make sure that
+ * even if d->is_dying changes under our feet, p2m_pod_empty_cache()
+ * won't start until we're done. */
+ if ( unlikely(d->is_dying) )
+ goto out_fail;
+
+    /* Because PoD does not have a cache list for 1GB pages, it has to remap
+     * the 1GB region as 2MB chunks for a retry. */
+ if ( order == 18 )
+ {
+ gfn_aligned = (gfn >> order) << order;
+ /* Note that we are supposed to call set_p2m_entry() 512 times to
+         * split 1GB into 512 2MB pages here. But we only do it once because
+         * set_p2m_entry() should automatically shatter the 1GB page into
+         * 512 2MB pages. The remaining 511 calls are unnecessary.
+ */
+ set_p2m_entry(p2m, gfn_aligned, _mfn(POPULATE_ON_DEMAND_MFN), 9,
+ p2m_populate_on_demand, p2m->default_access);
+ audit_p2m(p2m, 1);
+ p2m_unlock(p2m);
+ return 0;
+ }
+
+ /* Once we've ballooned down enough that we can fill the remaining
+ * PoD entries from the cache, don't sweep even if the particular
+ * list we want to use is empty: that can lead to thrashing zero pages
+ * through the cache for no good reason. */
+ if ( p2m->pod.entry_count > p2m->pod.count )
+ {
+
+ /* If we're low, start a sweep */
+ if ( order == 9 && page_list_empty(&p2m->pod.super) )
+ p2m_pod_emergency_sweep_super(p2m);
+
+ if ( page_list_empty(&p2m->pod.single) &&
+ ( ( order == 0 )
+ || (order == 9 && page_list_empty(&p2m->pod.super) ) ) )
+ p2m_pod_emergency_sweep(p2m);
+ }
+
+ /* Keep track of the highest gfn demand-populated by a guest fault */
+ if ( q == p2m_guest && gfn > p2m->pod.max_guest )
+ p2m->pod.max_guest = gfn;
+
+ spin_lock(&d->page_alloc_lock);
+
+ if ( p2m->pod.count == 0 )
+ goto out_of_memory;
+
+ /* Get a page f/ the cache. A NULL return value indicates that the
+ * 2-meg range should be marked singleton PoD, and retried */
+ if ( (p = p2m_pod_cache_get(p2m, order)) == NULL )
+ goto remap_and_retry;
+
+ mfn = page_to_mfn(p);
+
+ BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
+
+ spin_unlock(&d->page_alloc_lock);
+
+ gfn_aligned = (gfn >> order) << order;
+
+    set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, p2m->default_access);
+
+ for( i = 0; i < (1UL << order); i++ )
+ {
+ set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i);
+ paging_mark_dirty(d, mfn_x(mfn) + i);
+ }
+
+ p2m->pod.entry_count -= (1 << order); /* Lock: p2m */
+ BUG_ON(p2m->pod.entry_count < 0);
+
+ if ( tb_init_done )
+ {
+ struct {
+ u64 gfn, mfn;
+ int d:16,order:16;
+ } t;
+
+ t.gfn = gfn;
+ t.mfn = mfn_x(mfn);
+ t.d = d->domain_id;
+ t.order = order;
+
+ __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
+ }
+
+ return 0;
+out_of_memory:
+ spin_unlock(&d->page_alloc_lock);
+
+    printk("%s: Out of populate-on-demand memory! tot_pages %" PRIu32 " pod_entries %" PRIi32 "\n",
+ __func__, d->tot_pages, p2m->pod.entry_count);
+ domain_crash(d);
+out_fail:
+ return -1;
+remap_and_retry:
+ BUG_ON(order != 9);
+ spin_unlock(&d->page_alloc_lock);
+
+ /* Remap this 2-meg region in singleton chunks */
+ gfn_aligned = (gfn>>order)<<order;
+ for(i=0; i<(1<<order); i++)
+ set_p2m_entry(p2m, gfn_aligned+i, _mfn(POPULATE_ON_DEMAND_MFN), 0,
+ p2m_populate_on_demand, p2m->default_access);
+ if ( tb_init_done )
+ {
+ struct {
+ u64 gfn;
+ int d:16;
+ } t;
+
+ t.gfn = gfn;
+ t.d = d->domain_id;
+
+ __trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), &t);
+ }
+
+ return 0;
+}
+
+
+int
+guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
+ unsigned int order)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ unsigned long i;
+ p2m_type_t ot;
+ mfn_t omfn;
+ int pod_count = 0;
+ int rc = 0;
+
+ BUG_ON(!paging_mode_translate(d));
+
+ rc = p2m_gfn_check_limit(d, gfn, order);
+ if ( rc != 0 )
+ return rc;
+
+ p2m_lock(p2m);
+ audit_p2m(p2m, 1);
+
+ P2M_DEBUG("mark pod gfn=%#lx\n", gfn);
+
+ /* Make sure all gpfns are unused */
+ for ( i = 0; i < (1UL << order); i++ )
+ {
+ omfn = gfn_to_mfn_query(p2m, gfn + i, &ot);
+ if ( p2m_is_ram(ot) )
+ {
+ printk("%s: gfn_to_mfn returned type %d!\n",
+ __func__, ot);
+ rc = -EBUSY;
+ goto out;
+ }
+ else if ( ot == p2m_populate_on_demand )
+ {
+            /* Count how many PoD entries we'll be replacing if successful */
+ pod_count++;
+ }
+ }
+
+ /* Now, actually do the two-way mapping */
+ if ( !set_p2m_entry(p2m, gfn, _mfn(POPULATE_ON_DEMAND_MFN), order,
+ p2m_populate_on_demand, p2m->default_access) )
+ rc = -EINVAL;
+ else
+ {
+ p2m->pod.entry_count += 1 << order; /* Lock: p2m */
+ p2m->pod.entry_count -= pod_count;
+ BUG_ON(p2m->pod.entry_count < 0);
+ }
+
+ audit_p2m(p2m, 1);
+ p2m_unlock(p2m);
+
+out:
+ return rc;
+}
+
diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/p2m-pt.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/p2m-pt.c Fri May 06 11:15:35 2011 +0100
@@ -0,0 +1,1301 @@
+/******************************************************************************
+ * arch/x86/mm/p2m-pt.c
+ *
+ * Implementation of p2m datastructures as pagetables, for use by
+ * NPT and shadow-pagetable code
+ *
+ * Parts of this code are Copyright (c) 2009-2011 by Citrix Systems, Inc.
+ * Parts of this code are Copyright (c) 2007 by Advanced Micro Devices.
+ * Parts of this code are Copyright (c) 2006-2007 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <asm/domain.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <xen/iommu.h>
+#include <asm/mem_event.h>
+#include <public/mem_event.h>
+#include <asm/mem_sharing.h>
+#include <xen/event.h>
+#include <xen/trace.h>
+#include <asm/hvm/nestedhvm.h>
+#include <asm/hvm/svm/amd-iommu-proto.h>
+
+/* Debugging and auditing of the P2M code? */
+#define P2M_AUDIT 0
+#define P2M_DEBUGGING 0
+
+/* Printouts */
+#define P2M_PRINTK(_f, _a...) \
+ debugtrace_printk("p2m: %s(): " _f, __func__, ##_a)
+#define P2M_ERROR(_f, _a...) \
+ printk("pg error: %s(): " _f, __func__, ##_a)
+#if P2M_DEBUGGING
+#define P2M_DEBUG(_f, _a...) \
+ debugtrace_printk("p2mdebug: %s(): " _f, __func__, ##_a)
+#else
+#define P2M_DEBUG(_f, _a...) do { (void)(_f); } while(0)
+#endif
+
+
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_to_page
+#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+
+/* PTE flags for the various types of p2m entry */
+#define P2M_BASE_FLAGS \
+ (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
+
+#define SUPERPAGE_PAGES (1UL << 9)
+#define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
+
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+{
+ unsigned long flags;
+#ifdef __x86_64__
+ /*
+ * AMD IOMMU: When we share p2m table with iommu, bit 9 - bit 11 will be
+ * used for iommu hardware to encode next io page level. Bit 59 - bit 62
+     * are used for iommu flags, We could not use these bits to store p2m types.
+ */
+ flags = (unsigned long)(t & 0x7f) << 12;
+#else
+ flags = (t & 0x7UL) << 9;
+#endif
+#ifndef HAVE_GRANT_MAP_P2M
+ BUG_ON(p2m_is_grant(t));
+#endif
+ switch(t)
+ {
+ case p2m_invalid:
+ default:
+ return flags;
+ case p2m_ram_rw:
+ case p2m_grant_map_rw:
+ return flags | P2M_BASE_FLAGS | _PAGE_RW;
+ case p2m_ram_logdirty:
+ return flags | P2M_BASE_FLAGS;
+ case p2m_ram_ro:
+ case p2m_grant_map_ro:
+ return flags | P2M_BASE_FLAGS;
+ case p2m_ram_shared:
+ return flags | P2M_BASE_FLAGS;
+ case p2m_mmio_dm:
+ return flags;
+ case p2m_mmio_direct:
+ if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
+ flags |= _PAGE_RW;
+ return flags | P2M_BASE_FLAGS | _PAGE_PCD;
+ case p2m_populate_on_demand:
+ return flags;
+ }
+}
+
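
p2m_type_to_flags() above packs the 7-bit p2m type into bits of the entry that the hardware leaves available to software, and p2m_flags_to_type() (used throughout the lookups below) unpacks it again. A stand-alone sketch of that round trip; the bit position and type values here are illustrative, not the real p2m_type_t encoding or PTE layout:

    #include <assert.h>
    #include <stdint.h>

    enum toy_p2m_type { TOY_INVALID = 0, TOY_RAM_RW = 1, TOY_MMIO_DM = 2 };

    #define TYPE_SHIFT 52       /* a software-available bit range (assumption) */
    #define TYPE_MASK  0x7fULL

    static uint64_t type_to_flags(enum toy_p2m_type t)
    {
        return ((uint64_t)t & TYPE_MASK) << TYPE_SHIFT;
    }

    static enum toy_p2m_type flags_to_type(uint64_t pte)
    {
        return (enum toy_p2m_type)((pte >> TYPE_SHIFT) & TYPE_MASK);
    }

    int main(void)
    {
        uint64_t pte = (0x1234ULL << 12)          /* frame number */
                     | 0x1                        /* present */
                     | type_to_flags(TOY_RAM_RW); /* encoded type */
        assert(flags_to_type(pte) == TOY_RAM_RW);
        return 0;
    }
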
+#if P2M_AUDIT
+void audit_p2m(struct p2m_domain *p2m, int strict_m2p);
+#else
+# define audit_p2m(_p2m, _m2p) do { (void)(_p2m),(_m2p); } while (0)
+#endif /* P2M_AUDIT */
+
+// Find the next level's P2M entry, checking for out-of-range gfn's...
+// Returns NULL on error.
+//
+l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+ unsigned long gfn, uint32_t shift, uint32_t max)
+{
+ u32 index;
+
+ index = *gfn_remainder >> shift;
+ if ( index >= max )
+ {
+ P2M_DEBUG("gfn=0x%lx out of range "
+ "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
+ gfn, *gfn_remainder, shift, index, max);
+ return NULL;
+ }
+ *gfn_remainder &= (1 << shift) - 1;
+ return (l1_pgentry_t *)table + index;
+}
+
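
p2m_find_entry() above splits the remaining gfn bits into an index for the current table level and a remainder for the levels below it. A stand-alone sketch of that split, using an illustrative 4-level, 512-entries-per-level layout:

    #include <assert.h>

    static unsigned int split_index(unsigned long *gfn_remainder,
                                    unsigned int shift, unsigned int max)
    {
        unsigned int index = *gfn_remainder >> shift;

        if ( index >= max )
            return ~0u;                        /* out of range for this table */
        *gfn_remainder &= (1UL << shift) - 1;  /* keep the low bits */
        return index;
    }

    int main(void)
    {
        unsigned long rem = 0x12345;                 /* gfn to look up */
        assert(split_index(&rem, 27, 512) == 0);     /* L4 index */
        assert(split_index(&rem, 18, 512) == 0);     /* L3 index */
        assert(split_index(&rem, 9, 512) == 0x91);   /* L2 index */
        assert(split_index(&rem, 0, 512) == 0x145);  /* L1 index */
        return 0;
    }
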
+struct page_info *
+p2m_alloc_ptp(struct p2m_domain *p2m, unsigned long type)
+{
+ struct page_info *pg;
+
+ ASSERT(p2m);
+ ASSERT(p2m->domain);
+ ASSERT(p2m->domain->arch.paging.alloc_page);
+ pg = p2m->domain->arch.paging.alloc_page(p2m->domain);
+ if (pg == NULL)
+ return NULL;
+
+ page_list_add_tail(pg, &p2m->pages);
+ pg->u.inuse.type_info = type | 1 | PGT_validated;
+
+ return pg;
+}
+
+void
+p2m_free_ptp(struct p2m_domain *p2m, struct page_info *pg)
+{
+ ASSERT(pg);
+ ASSERT(p2m);
+ ASSERT(p2m->domain);
+ ASSERT(p2m->domain->arch.paging.free_page);
+
+ page_list_del(pg, &p2m->pages);
+ p2m->domain->arch.paging.free_page(p2m->domain, pg);
+
+ return;
+}
+
+/* Free intermediate tables from a p2m sub-tree */
+void
+p2m_free_entry(struct p2m_domain *p2m, l1_pgentry_t *p2m_entry, int page_order)
+{
+ /* End if the entry is a leaf entry. */
+ if ( page_order == 0
+ || !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT)
+ || (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+ return;
+
+ if ( page_order > 9 )
+ {
+ l1_pgentry_t *l3_table = map_domain_page(l1e_get_pfn(*p2m_entry));
+ for ( int i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+ p2m_free_entry(p2m, l3_table + i, page_order - 9);
+ unmap_domain_page(l3_table);
+ }
+
+ p2m_free_ptp(p2m, mfn_to_page(_mfn(l1e_get_pfn(*p2m_entry))));
+}
+
+// Walk one level of the P2M table, allocating a new table if required.
+// Returns 0 on error.
+//
+
+/* AMD IOMMU: Convert next level bits and r/w bits into 24 bits p2m flags */
+#define iommu_nlevel_to_flags(nl, f) ((((nl) & 0x7) << 9 )|(((f) & 0x3) << 21))
+
+static void p2m_add_iommu_flags(l1_pgentry_t *p2m_entry,
+ unsigned int nlevel, unsigned int flags)
+{
+#if CONFIG_PAGING_LEVELS == 4
+ if ( iommu_hap_pt_share )
+ l1e_add_flags(*p2m_entry, iommu_nlevel_to_flags(nlevel, flags));
+#endif
+}
+
+static int
+p2m_next_level(struct p2m_domain *p2m, mfn_t *table_mfn, void **table,
+ unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
+ u32 max, unsigned long type)
+{
+ l1_pgentry_t *l1_entry;
+ l1_pgentry_t *p2m_entry;
+ l1_pgentry_t new_entry;
+ void *next;
+ int i;
+
+ if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
+ shift, max)) )
+ return 0;
+
+ /* PoD: Not present doesn't imply empty. */
+ if ( !l1e_get_flags(*p2m_entry) )
+ {
+ struct page_info *pg;
+
+ pg = p2m_alloc_ptp(p2m, type);
+ if ( pg == NULL )
+ return 0;
+
+ new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+ __PAGE_HYPERVISOR | _PAGE_USER);
+
+ switch ( type ) {
+ case PGT_l3_page_table:
+            p2m_add_iommu_flags(&new_entry, 3, IOMMUF_readable|IOMMUF_writable);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4);
+ break;
+ case PGT_l2_page_table:
+#if CONFIG_PAGING_LEVELS == 3
+ /* for PAE mode, PDPE only has PCD/PWT/P bits available */
+ new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
+#endif
+            p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
+ break;
+ case PGT_l1_page_table:
+            p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2);
+ break;
+ default:
+ BUG();
+ break;
+ }
+ }
+
+ ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
+
+ /* split 1GB pages into 2MB pages */
+ if ( type == PGT_l2_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+ {
+ unsigned long flags, pfn;
+ struct page_info *pg;
+
+ pg = p2m_alloc_ptp(p2m, PGT_l2_page_table);
+ if ( pg == NULL )
+ return 0;
+
+ flags = l1e_get_flags(*p2m_entry);
+ pfn = l1e_get_pfn(*p2m_entry);
+
+ l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
+ for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+ {
+ new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
+            p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
+ p2m->write_p2m_entry(p2m, gfn,
+ l1_entry+i, *table_mfn, new_entry, 2);
+ }
+ unmap_domain_page(l1_entry);
+ new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+ __PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
+ p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
+ }
+
+
+ /* split single 2MB large page into 4KB page in P2M table */
+ if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+ {
+ unsigned long flags, pfn;
+ struct page_info *pg;
+
+ pg = p2m_alloc_ptp(p2m, PGT_l1_page_table);
+ if ( pg == NULL )
+ return 0;
+
+ /* New splintered mappings inherit the flags of the old superpage,
+ * with a little reorganisation for the _PAGE_PSE_PAT bit. */
+ flags = l1e_get_flags(*p2m_entry);
+ pfn = l1e_get_pfn(*p2m_entry);
+ if ( pfn & 1 ) /* ==> _PAGE_PSE_PAT was set */
+ pfn -= 1; /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
+ else
+ flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
+
+ l1_entry = __map_domain_page(pg);
+ for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+ {
+ new_entry = l1e_from_pfn(pfn + i, flags);
+ p2m_add_iommu_flags(&new_entry, 0, 0);
+ p2m->write_p2m_entry(p2m, gfn,
+ l1_entry+i, *table_mfn, new_entry, 1);
+ }
+ unmap_domain_page(l1_entry);
+
+ new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+ __PAGE_HYPERVISOR|_PAGE_USER);
+ p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
+ p2m->write_p2m_entry(p2m, gfn,
+ p2m_entry, *table_mfn, new_entry, 2);
+ }
+
+ *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
+ next = map_domain_page(mfn_x(*table_mfn));
+ unmap_domain_page(*table);
+ *table = next;
+
+ return 1;
+}
+
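
When p2m_next_level() above finds a superpage where it needs a lower-level table, it splinters it: the new entries cover the same range and inherit the superpage's flags, with the leaf entries losing the PSE bit (as in the 2MB-to-4k case). A simplified stand-alone sketch of that splintering, using toy frame+flags entries rather than real PTEs:

    #include <assert.h>

    #define ENTRIES   512
    #define FLAG_PSE  0x80u

    struct toy_entry { unsigned long pfn; unsigned int flags; };

    /* Fill a new table so it covers the same range as the old superpage. */
    static void splinter(const struct toy_entry *super,
                         struct toy_entry table[ENTRIES])
    {
        unsigned int i;
        unsigned int flags = super->flags & ~FLAG_PSE;  /* leaf entries: no PSE */

        for ( i = 0; i < ENTRIES; i++ )
        {
            table[i].pfn = super->pfn + i;   /* consecutive frames */
            table[i].flags = flags;
        }
    }

    int main(void)
    {
        struct toy_entry super = { .pfn = 0x1000, .flags = 0x3 | FLAG_PSE };
        static struct toy_entry table[ENTRIES];

        splinter(&super, table);
        assert(table[0].pfn == 0x1000 && table[511].pfn == 0x11ff);
        assert((table[0].flags & FLAG_PSE) == 0);
        return 0;
    }
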
+// Returns 0 on error (out of memory)
+static int
+p2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
+ unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma)
+{
+ // XXX -- this might be able to be faster iff current->domain == d
+ mfn_t table_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
+ void *table =map_domain_page(mfn_x(table_mfn));
+ unsigned long i, gfn_remainder = gfn;
+ l1_pgentry_t *p2m_entry;
+ l1_pgentry_t entry_content;
+ l2_pgentry_t l2e_content;
+ l3_pgentry_t l3e_content;
+ int rv=0;
+ unsigned int iommu_pte_flags = (p2mt == p2m_ram_rw) ?
+ IOMMUF_readable|IOMMUF_writable:
+ 0;
+ unsigned long old_mfn = 0;
+
+ if ( tb_init_done )
+ {
+ struct {
+ u64 gfn, mfn;
+ int p2mt;
+ int d:16,order:16;
+ } t;
+
+ t.gfn = gfn;
+ t.mfn = mfn_x(mfn);
+ t.p2mt = p2mt;
+ t.d = p2m->domain->domain_id;
+ t.order = page_order;
+
+ __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t);
+ }
+
+#if CONFIG_PAGING_LEVELS >= 4
+ if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
+ L4_PAGETABLE_SHIFT - PAGE_SHIFT,
+ L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
+ goto out;
+#endif
+ /*
+ * Try to allocate 1GB page table if this feature is supported.
+ */
+ if ( page_order == 18 )
+ {
+ l1_pgentry_t old_entry = l1e_empty();
+ p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+ L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+ L3_PAGETABLE_ENTRIES);
+ ASSERT(p2m_entry);
+ if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+ !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+ {
+ /* We're replacing a non-SP page with a superpage. Make sure to
+ * handle freeing the table properly. */
+ old_entry = *p2m_entry;
+ }
+
+ ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
+ l3e_content = mfn_valid(mfn)
+ ? l3e_from_pfn(mfn_x(mfn),
+ p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
+ : l3e_empty();
+ entry_content.l1 = l3e_content.l3;
+
+ if ( entry_content.l1 != 0 )
+ {
+ p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
+ old_mfn = l1e_get_pfn(*p2m_entry);
+ }
+
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3);
+ /* NB: paging_write_p2m_entry() handles tlb flushes properly */
+
+ /* Free old intermediate tables if necessary */
+ if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
+ p2m_free_entry(p2m, &old_entry, page_order);
+ }
+ /*
+ * When using PAE Xen, we only allow 33 bits of pseudo-physical
+ * address in translated guests (i.e. 8 GBytes). This restriction
+ * comes from wanting to map the P2M table into the 16MB RO_MPT hole
+ * in Xen's address space for translated PV guests.
+ * When using AMD's NPT on PAE Xen, we are restricted to 4GB.
+ */
+ else if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
+ L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+ ((CONFIG_PAGING_LEVELS == 3)
+ ? (hap_enabled(p2m->domain) ? 4 : 8)
+ : L3_PAGETABLE_ENTRIES),
+ PGT_l2_page_table) )
+ goto out;
+
+ if ( page_order == 0 )
+ {
+ if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
+ L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+ L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+ goto out;
+
+ p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+ 0, L1_PAGETABLE_ENTRIES);
+ ASSERT(p2m_entry);
+
+ if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
+ entry_content = l1e_from_pfn(mfn_x(mfn),
+ p2m_type_to_flags(p2mt, mfn));
+ else
+ entry_content = l1e_empty();
+
+ if ( entry_content.l1 != 0 )
+ {
+ p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
+ old_mfn = l1e_get_pfn(*p2m_entry);
+ }
+ /* level 1 entry */
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
+ /* NB: paging_write_p2m_entry() handles tlb flushes properly */
+ }
+ else if ( page_order == 9 )
+ {
+ l1_pgentry_t old_entry = l1e_empty();
+ p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+ L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+ L2_PAGETABLE_ENTRIES);
+ ASSERT(p2m_entry);
+
+ /* FIXME: Deal with 4k replaced by 2meg pages */
+ if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+ !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+ {
+ /* We're replacing a non-SP page with a superpage. Make sure to
+ * handle freeing the table properly. */
+ old_entry = *p2m_entry;
+ }
+
+ ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
+ if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
+ l2e_content = l2e_from_pfn(mfn_x(mfn),
+ p2m_type_to_flags(p2mt, mfn) |
+ _PAGE_PSE);
+ else
+ l2e_content = l2e_empty();
+
+ entry_content.l1 = l2e_content.l2;
+
+ if ( entry_content.l1 != 0 )
+ {
+ p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
+ old_mfn = l1e_get_pfn(*p2m_entry);
+ }
+
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2);
+ /* NB: paging_write_p2m_entry() handles tlb flushes properly */
+
+ /* Free old intermediate tables if necessary */
+ if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
+ p2m_free_entry(p2m, &old_entry, page_order);
+ }
+
+ /* Track the highest gfn for which we have ever had a valid mapping */
+ if ( mfn_valid(mfn)
+ && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
+ p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
+
+ if ( iommu_enabled && need_iommu(p2m->domain) )
+ {
+ if ( iommu_hap_pt_share )
+ {
+ if ( old_mfn && (old_mfn != mfn_x(mfn)) )
+ amd_iommu_flush_pages(p2m->domain, gfn, page_order);
+ }
+ else
+ {
+ if ( p2mt == p2m_ram_rw )
+ for ( i = 0; i < (1UL << page_order); i++ )
+ iommu_map_page(p2m->domain, gfn+i, mfn_x(mfn)+i,
+ IOMMUF_readable|IOMMUF_writable);
+ else
+ for ( int i = 0; i < (1UL << page_order); i++ )
+ iommu_unmap_page(p2m->domain, gfn+i);
+ }
+ }
+
+ /* Success */
+ rv = 1;
+
+out:
+ unmap_domain_page(table);
+ return rv;
+}
+
+
+/* Non-ept "lock-and-check" wrapper */
+static int p2m_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p2m_entry, int order,
+ p2m_query_t q)
+{
+ /* Only take the lock if we don't already have it. Otherwise it
+ * wouldn't be safe to do p2m lookups with the p2m lock held */
+ int do_locking = !p2m_locked_by_me(p2m);
+ int r;
+
+ if ( do_locking )
+ p2m_lock(p2m);
+
+ audit_p2m(p2m, 1);
+
+ /* Check to make sure this is still PoD */
+    if ( p2m_flags_to_type(l1e_get_flags(*p2m_entry)) != p2m_populate_on_demand )
+ {
+ if ( do_locking )
+ p2m_unlock(p2m);
+ return 0;
+ }
+
+ r = p2m_pod_demand_populate(p2m, gfn, order, q);
+
+ audit_p2m(p2m, 1);
+ if ( do_locking )
+ p2m_unlock(p2m);
+
+ return r;
+}
+
+
+static mfn_t
+p2m_gfn_to_mfn(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
+ p2m_query_t q)
+{
+ mfn_t mfn;
+ paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
+ l2_pgentry_t *l2e;
+ l1_pgentry_t *l1e;
+
+ ASSERT(paging_mode_translate(p2m->domain));
+
+ /* XXX This is for compatibility with the old model, where anything not
+ * XXX marked as RAM was considered to be emulated MMIO space.
+ * XXX Once we start explicitly registering MMIO regions in the p2m
+ * XXX we will return p2m_invalid for unmapped gfns */
+ *t = p2m_mmio_dm;
+ /* Not implemented except with EPT */
+ *a = p2m_access_rwx;
+
+ mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
+
+ if ( gfn > p2m->max_mapped_pfn )
+ /* This pfn is higher than the highest the p2m map currently holds */
+ return _mfn(INVALID_MFN);
+
+#if CONFIG_PAGING_LEVELS >= 4
+ {
+ l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
+ l4e += l4_table_offset(addr);
+ if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
+ {
+ unmap_domain_page(l4e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l4e_get_pfn(*l4e));
+ unmap_domain_page(l4e);
+ }
+#endif
+ {
+ l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
+#if CONFIG_PAGING_LEVELS == 3
+ /* On PAE hosts the p2m has eight l3 entries, not four (see
+ * shadow_set_p2m_entry()) so we can't use l3_table_offset.
+ * Instead, just count the number of l3es from zero. It's safe
+ * to do this because we already checked that the gfn is within
+ * the bounds of the p2m. */
+ l3e += (addr >> L3_PAGETABLE_SHIFT);
+#else
+ l3e += l3_table_offset(addr);
+#endif
+pod_retry_l3:
+ if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+ {
+            if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand )
+ {
+ if ( q != p2m_query )
+ {
+ if ( !p2m_pod_demand_populate(p2m, gfn, 18, q) )
+ goto pod_retry_l3;
+ }
+ else
+ *t = p2m_populate_on_demand;
+ }
+ unmap_domain_page(l3e);
+ return _mfn(INVALID_MFN);
+ }
+ else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
+ {
+ mfn = _mfn(l3e_get_pfn(*l3e) +
+ l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
+ l1_table_offset(addr));
+ *t = p2m_flags_to_type(l3e_get_flags(*l3e));
+ unmap_domain_page(l3e);
+
+ ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+ return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+ }
+
+ mfn = _mfn(l3e_get_pfn(*l3e));
+ unmap_domain_page(l3e);
+ }
+
+ l2e = map_domain_page(mfn_x(mfn));
+ l2e += l2_table_offset(addr);
+
+pod_retry_l2:
+ if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+ {
+ /* PoD: Try to populate a 2-meg chunk */
+ if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
+ {
+ if ( q != p2m_query ) {
+ if ( !p2m_pod_check_and_populate(p2m, gfn,
+ (l1_pgentry_t *)l2e, 9, q) )
+ goto pod_retry_l2;
+ } else
+ *t = p2m_populate_on_demand;
+ }
+
+ unmap_domain_page(l2e);
+ return _mfn(INVALID_MFN);
+ }
+ else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
+ {
+ mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
+ *t = p2m_flags_to_type(l2e_get_flags(*l2e));
+ unmap_domain_page(l2e);
+
+ ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+ return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+ }
+
+ mfn = _mfn(l2e_get_pfn(*l2e));
+ unmap_domain_page(l2e);
+
+ l1e = map_domain_page(mfn_x(mfn));
+ l1e += l1_table_offset(addr);
+pod_retry_l1:
+ if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+ {
+ /* PoD: Try to populate */
+ if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand )
+ {
+ if ( q != p2m_query ) {
+ if ( !p2m_pod_check_and_populate(p2m, gfn,
+ (l1_pgentry_t *)l1e, 0, q) )
+ goto pod_retry_l1;
+ } else
+ *t = p2m_populate_on_demand;
+ }
+
+ unmap_domain_page(l1e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l1e_get_pfn(*l1e));
+ *t = p2m_flags_to_type(l1e_get_flags(*l1e));
+ unmap_domain_page(l1e);
+
+ ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+ return (p2m_is_valid(*t) || p2m_is_grant(*t)) ? mfn : _mfn(INVALID_MFN);
+}
+
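
When the walk in p2m_gfn_to_mfn() above stops at a PSE (superpage) entry, the target frame is computed as the superpage's base frame plus the would-be lower-level offsets. A stand-alone sketch of that arithmetic, with hypothetical helper names and values:

    #include <assert.h>

    #define ENTRIES 512UL   /* entries per level with 4k pages */

    /* gfn inside a 1GB mapping: base + L2 offset * 512 + L1 offset */
    static unsigned long mfn_in_1g(unsigned long base, unsigned long gfn)
    {
        return base + ((gfn >> 9) & (ENTRIES - 1)) * ENTRIES + (gfn & (ENTRIES - 1));
    }

    /* gfn inside a 2MB mapping: base + L1 offset */
    static unsigned long mfn_in_2m(unsigned long base, unsigned long gfn)
    {
        return base + (gfn & (ENTRIES - 1));
    }

    int main(void)
    {
        assert(mfn_in_1g(0x100000, 0x3ffff) == 0x13ffff);
        assert(mfn_in_2m(0x100000, 0x1ff) == 0x1001ff);
        return 0;
    }
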
+/* Read the current domain's p2m table (through the linear mapping). */
+static mfn_t p2m_gfn_to_mfn_current(struct p2m_domain *p2m,
+                                    unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
+ p2m_query_t q)
+{
+ mfn_t mfn = _mfn(INVALID_MFN);
+ p2m_type_t p2mt = p2m_mmio_dm;
+ paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
+ /* XXX This is for compatibility with the old model, where anything not
+ * XXX marked as RAM was considered to be emulated MMIO space.
+ * XXX Once we start explicitly registering MMIO regions in the p2m
+ * XXX we will return p2m_invalid for unmapped gfns */
+
+ /* Not currently implemented except for EPT */
+ *a = p2m_access_rwx;
+
+ if ( gfn <= p2m->max_mapped_pfn )
+ {
+ l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
+ l2_pgentry_t l2e = l2e_empty();
+ int ret;
+#if CONFIG_PAGING_LEVELS >= 4
+ l3_pgentry_t l3e = l3e_empty();
+#endif
+
+ ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START)
+ / sizeof(l1_pgentry_t));
+
+#if CONFIG_PAGING_LEVELS >= 4
+ /*
+ * Read & process L3
+ */
+ p2m_entry = (l1_pgentry_t *)
+ &__linear_l2_table[l2_linear_offset(RO_MPT_VIRT_START)
+ + l3_linear_offset(addr)];
+ pod_retry_l3:
+ ret = __copy_from_user(&l3e, p2m_entry, sizeof(l3e));
+
+ if ( ret != 0 || !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+ {
+ if ( (l3e_get_flags(l3e) & _PAGE_PSE) &&
+                 (p2m_flags_to_type(l3e_get_flags(l3e)) == p2m_populate_on_demand) )
+ {
+ /* The read has succeeded, so we know that mapping exists */
+ if ( q != p2m_query )
+ {
+ if ( !p2m_pod_demand_populate(p2m, gfn, 18, q) )
+ goto pod_retry_l3;
+ p2mt = p2m_invalid;
+ printk("%s: Allocate 1GB failed!\n", __func__);
+ goto out;
+ }
+ else
+ {
+ p2mt = p2m_populate_on_demand;
+ goto out;
+ }
+ }
+ goto pod_retry_l2;
+ }
+
+ if ( l3e_get_flags(l3e) & _PAGE_PSE )
+ {
+ p2mt = p2m_flags_to_type(l3e_get_flags(l3e));
+ ASSERT(l3e_get_pfn(l3e) != INVALID_MFN || !p2m_is_ram(p2mt));
+ if (p2m_is_valid(p2mt) )
+ mfn = _mfn(l3e_get_pfn(l3e) +
+ l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
+ l1_table_offset(addr));
+ else
+ p2mt = p2m_mmio_dm;
+
+ goto out;
+ }
+#endif
+ /*
+ * Read & process L2
+ */
+ p2m_entry = &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START)
+ + l2_linear_offset(addr)];
+
+ pod_retry_l2:
+ ret = __copy_from_user(&l2e,
+ p2m_entry,
+ sizeof(l2e));
+ if ( ret != 0
+ || !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+ {
+ if( (l2e_get_flags(l2e) & _PAGE_PSE)
+ && ( p2m_flags_to_type(l2e_get_flags(l2e))
+ == p2m_populate_on_demand ) )
+ {
+ /* The read has succeeded, so we know that the mapping
+                 * exists at this point. */
+ if ( q != p2m_query )
+ {
+ if ( !p2m_pod_check_and_populate(p2m, gfn,
+ p2m_entry, 9, q) )
+ goto pod_retry_l2;
+
+ /* Allocate failed. */
+ p2mt = p2m_invalid;
+ printk("%s: Allocate failed!\n", __func__);
+ goto out;
+ }
+ else
+ {
+ p2mt = p2m_populate_on_demand;
+ goto out;
+ }
+ }
+
+ goto pod_retry_l1;
+ }
+
+ if (l2e_get_flags(l2e) & _PAGE_PSE)
+ {
+ p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
+ ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
+
+ if ( p2m_is_valid(p2mt) )
+ mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
+ else
+ p2mt = p2m_mmio_dm;
+
+ goto out;
+ }
+
+ /*
+ * Read and process L1
+ */
+
+ /* Need to __copy_from_user because the p2m is sparse and this
+ * part might not exist */
+ pod_retry_l1:
+ p2m_entry = &phys_to_machine_mapping[gfn];
+
+ ret = __copy_from_user(&l1e,
+ p2m_entry,
+ sizeof(l1e));
+
+ if ( ret == 0 ) {
+ p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+ ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+
+ if ( p2m_flags_to_type(l1e_get_flags(l1e))
+ == p2m_populate_on_demand )
+ {
+ /* The read has succeeded, so we know that the mapping
+                 * exists at this point. */
+ if ( q != p2m_query )
+ {
+ if ( !p2m_pod_check_and_populate(p2m, gfn,
+                                                     (l1_pgentry_t *)p2m_entry, 0, q) )
+ goto pod_retry_l1;
+
+ /* Allocate failed. */
+ p2mt = p2m_invalid;
+ goto out;
+ }
+ else
+ {
+ p2mt = p2m_populate_on_demand;
+ goto out;
+ }
+ }
+
+ if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) )
+ mfn = _mfn(l1e_get_pfn(l1e));
+ else
+ /* XXX see above */
+ p2mt = p2m_mmio_dm;
+ }
+ }
+out:
+ *t = p2mt;
+ return mfn;
+}
+
+/* Walk the whole p2m table, changing any entries of the old type
+ * to the new type. This is used in hardware-assisted paging to
+ * quickly enable or disable log-dirty tracking */
+void p2m_change_type_global(struct p2m_domain *p2m, p2m_type_t ot, p2m_type_t nt)
+{
+ unsigned long mfn, gfn, flags;
+ l1_pgentry_t l1e_content;
+ l1_pgentry_t *l1e;
+ l2_pgentry_t *l2e;
+ mfn_t l1mfn, l2mfn, l3mfn;
+ unsigned long i1, i2, i3;
+ l3_pgentry_t *l3e;
+#if CONFIG_PAGING_LEVELS == 4
+ l4_pgentry_t *l4e;
+ unsigned long i4;
+#endif /* CONFIG_PAGING_LEVELS == 4 */
+
+ BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+ BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
+
+ if ( !paging_mode_translate(p2m->domain) )
+ return;
+
+ if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) == 0 )
+ return;
+
+ ASSERT(p2m_locked_by_me(p2m));
+
+#if CONFIG_PAGING_LEVELS == 4
+ l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+#else /* CONFIG_PAGING_LEVELS == 3 */
+ l3mfn = _mfn(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+ l3e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+ for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+ {
+ if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+ {
+ continue;
+ }
+ l3mfn = _mfn(l4e_get_pfn(l4e[i4]));
+ l3e = map_domain_page(l4e_get_pfn(l4e[i4]));
+#endif
+ for ( i3 = 0;
+ i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
+ i3++ )
+ {
+ if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+ {
+ continue;
+ }
+ if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) )
+ {
+ flags = l3e_get_flags(l3e[i3]);
+ if ( p2m_flags_to_type(flags) != ot )
+ continue;
+ mfn = l3e_get_pfn(l3e[i3]);
+ gfn = get_gpfn_from_mfn(mfn);
+ flags = p2m_type_to_flags(nt, _mfn(mfn));
+ l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+ p2m->write_p2m_entry(p2m, gfn,
+ (l1_pgentry_t *)&l3e[i3],
+ l3mfn, l1e_content, 3);
+ continue;
+ }
+
+ l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
+ l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
+ for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+ {
+ if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+ {
+ continue;
+ }
+
+ if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
+ {
+ flags = l2e_get_flags(l2e[i2]);
+ if ( p2m_flags_to_type(flags) != ot )
+ continue;
+ mfn = l2e_get_pfn(l2e[i2]);
+                    /* Do not use get_gpfn_from_mfn because it may return SHARED_M2P_ENTRY */
+ gfn = (i2 + (i3
+#if CONFIG_PAGING_LEVELS >= 4
+ + (i4 * L3_PAGETABLE_ENTRIES)
+#endif
+ )
+ * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
+ flags = p2m_type_to_flags(nt, _mfn(mfn));
+ l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+ p2m->write_p2m_entry(p2m, gfn,
+ (l1_pgentry_t *)&l2e[i2],
+ l2mfn, l1e_content, 2);
+ continue;
+ }
+
+ l1mfn = _mfn(l2e_get_pfn(l2e[i2]));
+ l1e = map_domain_page(mfn_x(l1mfn));
+
+ for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+ {
+ flags = l1e_get_flags(l1e[i1]);
+ if ( p2m_flags_to_type(flags) != ot )
+ continue;
+ mfn = l1e_get_pfn(l1e[i1]);
+ gfn = i1 + (i2 + (i3
+#if CONFIG_PAGING_LEVELS >= 4
+ + (i4 * L3_PAGETABLE_ENTRIES)
+#endif
+ )
+ * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
+ /* create a new 1le entry with the new type */
+ flags = p2m_type_to_flags(nt, _mfn(mfn));
+ l1e_content = l1e_from_pfn(mfn, flags);
+ p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
+ l1mfn, l1e_content, 1);
+ }
+ unmap_domain_page(l1e);
+ }
+ unmap_domain_page(l2e);
+ }
+#if CONFIG_PAGING_LEVELS >= 4
+ unmap_domain_page(l3e);
+ }
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+ unmap_domain_page(l4e);
+#else /* CONFIG_PAGING_LEVELS == 3 */
+ unmap_domain_page(l3e);
+#endif
+
+}
+
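
p2m_change_type_global() above reconstructs the gfn for each entry from the table indices rather than trusting the m2p. A stand-alone sketch of that index-to-gfn arithmetic for a 4-level, 512-entries-per-level layout, mirroring the in-line computation in the loops:

    #include <assert.h>

    #define ENTRIES 512UL

    static unsigned long indices_to_gfn(unsigned long i4, unsigned long i3,
                                        unsigned long i2, unsigned long i1)
    {
        return i1 + (i2 + (i3 + i4 * ENTRIES) * ENTRIES) * ENTRIES;
    }

    int main(void)
    {
        /* gfn 0x12345 decomposes as L4=0, L3=0, L2=0x91, L1=0x145. */
        assert(indices_to_gfn(0, 0, 0x91, 0x145) == 0x12345);
        return 0;
    }
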
+/* Set up the p2m function pointers for pagetable format */
+void p2m_pt_init(struct p2m_domain *p2m)
+{
+ p2m->set_entry = p2m_set_entry;
+ p2m->get_entry = p2m_gfn_to_mfn;
+ p2m->get_entry_current = p2m_gfn_to_mfn_current;
+ p2m->change_entry_type_global = p2m_change_type_global;
+ p2m->write_p2m_entry = paging_write_p2m_entry;
+}
+
+
+#if P2M_AUDIT
+/* strict_m2p == 0 allows m2p mappings that don't match the p2m.
+ * It's intended for add_to_physmap, when the domain has just been allocated
+ * new mfns that might have stale m2p entries from previous owners */
+void audit_p2m(struct p2m_domain *p2m, int strict_m2p)
+{
+ struct page_info *page;
+ struct domain *od;
+ unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
+ int entry_count = 0;
+ mfn_t p2mfn;
+ unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
+ int test_linear;
+ p2m_type_t type;
+ struct domain *d = p2m->domain;
+
+ if ( !paging_mode_translate(d) )
+ return;
+
+ //P2M_PRINTK("p2m audit starts\n");
+
+ test_linear = ( (d == current->domain)
+ && !pagetable_is_null(current->arch.monitor_table) );
+ if ( test_linear )
+ flush_tlb_local();
+
+ spin_lock(&d->page_alloc_lock);
+
+ /* Audit part one: walk the domain's page allocation list, checking
+ * the m2p entries. */
+ page_list_for_each ( page, &d->page_list )
+ {
+ mfn = mfn_x(page_to_mfn(page));
+
+ // P2M_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
+
+ od = page_get_owner(page);
+
+ if ( od != d )
+ {
+ P2M_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
+ mfn, od, (od?od->domain_id:-1), d, d->domain_id);
+ continue;
+ }
+
+ gfn = get_gpfn_from_mfn(mfn);
+ if ( gfn == INVALID_M2P_ENTRY )
+ {
+ orphans_i++;
+ //P2M_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
+ // mfn);
+ continue;
+ }
+
+ if ( gfn == 0x55555555 || gfn == 0x5555555555555555 )
+ {
+ orphans_d++;
+ //P2M_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
+ // mfn);
+ continue;
+ }
+
+ if ( gfn == SHARED_M2P_ENTRY )
+ {
+ P2M_PRINTK("shared mfn (%lx) on domain page list!\n",
+ mfn);
+ continue;
+ }
+
+ p2mfn = gfn_to_mfn_type_p2m(p2m, gfn, &type, p2m_query);
+ if ( strict_m2p && mfn_x(p2mfn) != mfn )
+ {
+ mpbad++;
+ P2M_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
+ " (-> gfn %#lx)\n",
+ mfn, gfn, mfn_x(p2mfn),
+ (mfn_valid(p2mfn)
+ ? get_gpfn_from_mfn(mfn_x(p2mfn))
+ : -1u));
+ /* This m2p entry is stale: the domain has another frame in
+ * this physical slot. No great disaster, but for neatness,
+ * blow away the m2p entry. */
+ set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+ }
+
+ if ( test_linear && (gfn <= p2m->max_mapped_pfn) )
+ {
+ lp2mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &type));
+ if ( lp2mfn != mfn_x(p2mfn) )
+ {
+ P2M_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
+ "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn));
+ }
+ }
+
+ // P2M_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
+ // mfn, gfn, mfn_x(p2mfn), lp2mfn);
+ }
+
+ spin_unlock(&d->page_alloc_lock);
+
+ /* Audit part two: walk the domain's p2m table, checking the entries. */
+ if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) != 0 )
+ {
+ l2_pgentry_t *l2e;
+ l1_pgentry_t *l1e;
+ int i1, i2;
+
+#if CONFIG_PAGING_LEVELS == 4
+ l4_pgentry_t *l4e;
+ l3_pgentry_t *l3e;
+ int i4, i3;
+        l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+#else /* CONFIG_PAGING_LEVELS == 3 */
+ l3_pgentry_t *l3e;
+ int i3;
+        l3e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+#endif
+
+ gfn = 0;
+#if CONFIG_PAGING_LEVELS >= 4
+ for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+ {
+ if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+ {
+ gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+ l3e = map_domain_page(mfn_x(_mfn(l4e_get_pfn(l4e[i4]))));
+#endif
+ for ( i3 = 0;
+ i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
+ i3++ )
+ {
+ if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+ {
+ gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+
+ /* check for 1GB super page */
+ if ( l3e_get_flags(l3e[i3]) & _PAGE_PSE )
+ {
+ mfn = l3e_get_pfn(l3e[i3]);
+ ASSERT(mfn_valid(_mfn(mfn)));
+ /* we have to cover 512x512 4K pages */
+ for ( i2 = 0;
+ i2 < (L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES);
+ i2++)
+ {
+ m2pfn = get_gpfn_from_mfn(mfn+i2);
+ if ( m2pfn != (gfn + i2) )
+ {
+ pmbad++;
+ P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+ " -> gfn %#lx\n", gfn+i2, mfn+i2,
+ m2pfn);
+ BUG();
+ }
+ gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+ }
+
+ l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3]))));
+ for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+ {
+ if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+ {
+ if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE)
+ && ( p2m_flags_to_type(l2e_get_flags(l2e[i2]))
+ == p2m_populate_on_demand ) )
+ entry_count+=SUPERPAGE_PAGES;
+ gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+
+ /* check for super page */
+ if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE )
+ {
+ mfn = l2e_get_pfn(l2e[i2]);
+ ASSERT(mfn_valid(_mfn(mfn)));
+ for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
+ {
+ m2pfn = get_gpfn_from_mfn(mfn+i1);
+ /* Allow shared M2Ps */
+ if ( (m2pfn != (gfn + i1)) &&
+ (m2pfn != SHARED_M2P_ENTRY) )
+ {
+ pmbad++;
+ P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+ " -> gfn %#lx\n", gfn+i1, mfn+i1,
+ m2pfn);
+ BUG();
+ }
+ }
+ gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+
+ l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
+
+ for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+ {
+ p2m_type_t type;
+
+ type = p2m_flags_to_type(l1e_get_flags(l1e[i1]));
+ if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+ {
+ if ( type == p2m_populate_on_demand )
+ entry_count++;
+ continue;
+ }
+ mfn = l1e_get_pfn(l1e[i1]);
+ ASSERT(mfn_valid(_mfn(mfn)));
+ m2pfn = get_gpfn_from_mfn(mfn);
+ if ( m2pfn != gfn &&
+ type != p2m_mmio_direct &&
+ !p2m_is_grant(type) &&
+ !p2m_is_shared(type) )
+ {
+ pmbad++;
+ printk("mismatch: gfn %#lx -> mfn %#lx"
+ " -> gfn %#lx\n", gfn, mfn, m2pfn);
+ P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+ " -> gfn %#lx\n", gfn, mfn, m2pfn);
+ BUG();
+ }
+ }
+ unmap_domain_page(l1e);
+ }
+ unmap_domain_page(l2e);
+ }
+#if CONFIG_PAGING_LEVELS >= 4
+ unmap_domain_page(l3e);
+ }
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+ unmap_domain_page(l4e);
+#else /* CONFIG_PAGING_LEVELS == 3 */
+ unmap_domain_page(l3e);
+#endif
+
+ }
+
+ if ( entry_count != p2m->pod.entry_count )
+ {
+ printk("%s: refcounted entry count %d, audit count %d!\n",
+ __func__,
+ p2m->pod.entry_count,
+ entry_count);
+ BUG();
+ }
+
+ //P2M_PRINTK("p2m audit complete\n");
+ //if ( orphans_i | orphans_d | mpbad | pmbad )
+ // P2M_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
+ // orphans_i + orphans_d, orphans_i, orphans_d);
+ if ( mpbad | pmbad )
+ {
+ P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
+ pmbad, mpbad);
+ WARN();
+ }
+}
+#endif /* P2M_AUDIT */
+
diff -r 4b0692880dfa -r 26c4beb6b520 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c Thu May 05 17:40:34 2011 +0100
+++ b/xen/arch/x86/mm/p2m.c Fri May 06 11:15:35 2011 +0100
@@ -37,10 +37,6 @@
#include <asm/hvm/nestedhvm.h>
#include <asm/hvm/svm/amd-iommu-proto.h>
-/* Debugging and auditing of the P2M code? */
-#define P2M_AUDIT 0
-#define P2M_DEBUGGING 0
-
/* turn on/off 1GB host page table support for hap, default on */
static bool_t __read_mostly opt_hap_1gb = 1;
boolean_param("hap_1gb", opt_hap_1gb);
@@ -69,1853 +65,14 @@ boolean_param("hap_2mb", opt_hap_2mb);
#undef page_to_mfn
#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
-
-/* PTE flags for the various types of p2m entry */
-#define P2M_BASE_FLAGS \
- (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
-
-#define SUPERPAGE_PAGES (1UL << 9)
-#define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
-
-unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
-{
- unsigned long flags;
-#ifdef __x86_64__
- /*
- * AMD IOMMU: When we share p2m table with iommu, bit 9 - bit 11 will be
- * used for iommu hardware to encode next io page level. Bit 59 - bit 62
- * are used for iommu flags, We could not use these bits to store p2m types.
- */
- flags = (unsigned long)(t & 0x7f) << 12;
-#else
- flags = (t & 0x7UL) << 9;
-#endif
-#ifndef HAVE_GRANT_MAP_P2M
- BUG_ON(p2m_is_grant(t));
-#endif
- switch(t)
- {
- case p2m_invalid:
- default:
- return flags;
- case p2m_ram_rw:
- case p2m_grant_map_rw:
- return flags | P2M_BASE_FLAGS | _PAGE_RW;
- case p2m_ram_logdirty:
- return flags | P2M_BASE_FLAGS;
- case p2m_ram_ro:
- case p2m_grant_map_ro:
- return flags | P2M_BASE_FLAGS;
- case p2m_ram_shared:
- return flags | P2M_BASE_FLAGS;
- case p2m_mmio_dm:
- return flags;
- case p2m_mmio_direct:
- if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
- flags |= _PAGE_RW;
- return flags | P2M_BASE_FLAGS | _PAGE_PCD;
- case p2m_populate_on_demand:
- return flags;
- }
-}
-
#if P2M_AUDIT
-static void audit_p2m(struct p2m_domain *p2m, int strict_m2p);
+extern void audit_p2m(struct p2m_domain *p2m, int strict_m2p);
#else
# define audit_p2m(_p2m, _m2p) do { (void)(_p2m),(_m2p); } while (0)
#endif /* P2M_AUDIT */
-// Find the next level's P2M entry, checking for out-of-range gfn's...
-// Returns NULL on error.
-//
-l1_pgentry_t *
-p2m_find_entry(void *table, unsigned long *gfn_remainder,
- unsigned long gfn, uint32_t shift, uint32_t max)
-{
- u32 index;
-
- index = *gfn_remainder >> shift;
- if ( index >= max )
- {
- P2M_DEBUG("gfn=0x%lx out of range "
- "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
- gfn, *gfn_remainder, shift, index, max);
- return NULL;
- }
- *gfn_remainder &= (1 << shift) - 1;
- return (l1_pgentry_t *)table + index;
-}
-
-struct page_info *
-p2m_alloc_ptp(struct p2m_domain *p2m, unsigned long type)
-{
- struct page_info *pg;
-
- ASSERT(p2m);
- ASSERT(p2m->domain);
- ASSERT(p2m->domain->arch.paging.alloc_page);
- pg = p2m->domain->arch.paging.alloc_page(p2m->domain);
- if (pg == NULL)
- return NULL;
-
- page_list_add_tail(pg, &p2m->pages);
- pg->u.inuse.type_info = type | 1 | PGT_validated;
-
- return pg;
-}
-
-void
-p2m_free_ptp(struct p2m_domain *p2m, struct page_info *pg)
-{
- ASSERT(pg);
- ASSERT(p2m);
- ASSERT(p2m->domain);
- ASSERT(p2m->domain->arch.paging.free_page);
-
- page_list_del(pg, &p2m->pages);
- p2m->domain->arch.paging.free_page(p2m->domain, pg);
-
- return;
-}
-
-/* Free intermediate tables from a p2m sub-tree */
-void
-p2m_free_entry(struct p2m_domain *p2m, l1_pgentry_t *p2m_entry, int page_order)
-{
- /* End if the entry is a leaf entry. */
- if ( page_order == 0
- || !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT)
- || (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
- return;
-
- if ( page_order > 9 )
- {
- l1_pgentry_t *l3_table = map_domain_page(l1e_get_pfn(*p2m_entry));
- for ( int i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
- p2m_free_entry(p2m, l3_table + i, page_order - 9);
- unmap_domain_page(l3_table);
- }
-
- p2m_free_ptp(p2m, mfn_to_page(_mfn(l1e_get_pfn(*p2m_entry))));
-}
-
-// Walk one level of the P2M table, allocating a new table if required.
-// Returns 0 on error.
-//
-
-/* AMD IOMMU: Convert next level bits and r/w bits into 24 bits p2m flags */
-#define iommu_nlevel_to_flags(nl, f) ((((nl) & 0x7) << 9 )|(((f) & 0x3) << 21))
-
-static void p2m_add_iommu_flags(l1_pgentry_t *p2m_entry,
- unsigned int nlevel, unsigned int flags)
-{
-#if CONFIG_PAGING_LEVELS == 4
- if ( iommu_hap_pt_share )
- l1e_add_flags(*p2m_entry, iommu_nlevel_to_flags(nlevel, flags));
-#endif
-}
-
-static int
-p2m_next_level(struct p2m_domain *p2m, mfn_t *table_mfn, void **table,
- unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
- u32 max, unsigned long type)
-{
- l1_pgentry_t *l1_entry;
- l1_pgentry_t *p2m_entry;
- l1_pgentry_t new_entry;
- void *next;
- int i;
-
- if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
- shift, max)) )
- return 0;
-
- /* PoD: Not present doesn't imply empty. */
- if ( !l1e_get_flags(*p2m_entry) )
- {
- struct page_info *pg;
-
- pg = p2m_alloc_ptp(p2m, type);
- if ( pg == NULL )
- return 0;
-
- new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
- __PAGE_HYPERVISOR | _PAGE_USER);
-
- switch ( type ) {
- case PGT_l3_page_table:
- p2m_add_iommu_flags(&new_entry, 3, IOMMUF_readable|IOMMUF_writable);
- p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4);
- break;
- case PGT_l2_page_table:
-#if CONFIG_PAGING_LEVELS == 3
- /* for PAE mode, PDPE only has PCD/PWT/P bits available */
- new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
-#endif
- p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
- p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
- break;
- case PGT_l1_page_table:
- p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
- p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2);
- break;
- default:
- BUG();
- break;
- }
- }
-
- ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
-
- /* split 1GB pages into 2MB pages */
- if ( type == PGT_l2_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
- {
- unsigned long flags, pfn;
- struct page_info *pg;
-
- pg = p2m_alloc_ptp(p2m, PGT_l2_page_table);
- if ( pg == NULL )
- return 0;
-
- flags = l1e_get_flags(*p2m_entry);
- pfn = l1e_get_pfn(*p2m_entry);
-
- l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
- {
- new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
- p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
- p2m->write_p2m_entry(p2m, gfn,
- l1_entry+i, *table_mfn, new_entry, 2);
- }
- unmap_domain_page(l1_entry);
- new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
- __PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
- p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
- p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
- }
-
-
- /* split single 2MB large page into 4KB page in P2M table */
- if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
- {
- unsigned long flags, pfn;
- struct page_info *pg;
-
- pg = p2m_alloc_ptp(p2m, PGT_l1_page_table);
- if ( pg == NULL )
- return 0;
-
- /* New splintered mappings inherit the flags of the old superpage,
- * with a little reorganisation for the _PAGE_PSE_PAT bit. */
- flags = l1e_get_flags(*p2m_entry);
- pfn = l1e_get_pfn(*p2m_entry);
- if ( pfn & 1 ) /* ==> _PAGE_PSE_PAT was set */
- pfn -= 1; /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
- else
- flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
-
- l1_entry = __map_domain_page(pg);
- for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
- {
- new_entry = l1e_from_pfn(pfn + i, flags);
- p2m_add_iommu_flags(&new_entry, 0, 0);
- p2m->write_p2m_entry(p2m, gfn,
- l1_entry+i, *table_mfn, new_entry, 1);
- }
- unmap_domain_page(l1_entry);
-
- new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
- __PAGE_HYPERVISOR|_PAGE_USER);
- p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
- p2m->write_p2m_entry(p2m, gfn,
- p2m_entry, *table_mfn, new_entry, 2);
- }
-
- *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
- next = map_domain_page(mfn_x(*table_mfn));
- unmap_domain_page(*table);
- *table = next;
-
- return 1;
-}
-
-/*
- * Populate-on-demand functionality
- */
-static
-int set_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
- unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma);
-
-static int
-p2m_pod_cache_add(struct p2m_domain *p2m,
- struct page_info *page,
- unsigned long order)
-{
- int i;
- struct page_info *p;
- struct domain *d = p2m->domain;
-
-#ifndef NDEBUG
- mfn_t mfn;
-
- mfn = page_to_mfn(page);
-
- /* Check to make sure this is a contiguous region */
- if( mfn_x(mfn) & ((1 << order) - 1) )
- {
- printk("%s: mfn %lx not aligned order %lu! (mask %lx)\n",
- __func__, mfn_x(mfn), order, ((1UL << order) - 1));
- return -1;
- }
-
- for(i=0; i < 1 << order ; i++) {
- struct domain * od;
-
- p = mfn_to_page(_mfn(mfn_x(mfn) + i));
- od = page_get_owner(p);
- if(od != d)
- {
- printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
- __func__, mfn_x(mfn), d->domain_id,
- od?od->domain_id:-1);
- return -1;
- }
- }
-#endif
-
- ASSERT(p2m_locked_by_me(p2m));
-
- /*
- * Pages from domain_alloc and returned by the balloon driver aren't
- * guaranteed to be zero; but by reclaiming zero pages, we implicitly
- * promise to provide zero pages. So we scrub pages before using.
- */
- for ( i = 0; i < (1 << order); i++ )
- {
- char *b = map_domain_page(mfn_x(page_to_mfn(page)) + i);
- clear_page(b);
- unmap_domain_page(b);
- }
-
- spin_lock(&d->page_alloc_lock);
-
- /* First, take all pages off the domain list */
- for(i=0; i < 1 << order ; i++)
- {
- p = page + i;
- page_list_del(p, &d->page_list);
- }
-
- /* Then add the first one to the appropriate populate-on-demand list */
- switch(order)
- {
- case 9:
- page_list_add_tail(page, &p2m->pod.super); /* lock: page_alloc */
- p2m->pod.count += 1 << order;
- break;
- case 0:
- page_list_add_tail(page, &p2m->pod.single); /* lock: page_alloc */
- p2m->pod.count += 1;
- break;
- default:
- BUG();
- }
-
- /* Ensure that the PoD cache has never been emptied.
- * This may cause "zombie domains" since the page will never be freed. */
- BUG_ON( d->arch.relmem != RELMEM_not_started );
-
- spin_unlock(&d->page_alloc_lock);
-
- return 0;
-}
-
-/* Get a page of size order from the populate-on-demand cache. Will break
- * down 2-meg pages into singleton pages automatically. Returns null if
- * a superpage is requested and no superpages are available. Must be called
- * with the d->page_lock held. */
-static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
- unsigned long order)
-{
- struct page_info *p = NULL;
- int i;
-
- if ( order == 9 && page_list_empty(&p2m->pod.super) )
- {
- return NULL;
- }
- else if ( order == 0 && page_list_empty(&p2m->pod.single) )
- {
- unsigned long mfn;
- struct page_info *q;
-
- BUG_ON( page_list_empty(&p2m->pod.super) );
-
- /* Break up a superpage to make single pages. NB count doesn't
- * need to be adjusted. */
- p = page_list_remove_head(&p2m->pod.super);
- mfn = mfn_x(page_to_mfn(p));
-
- for ( i=0; i<SUPERPAGE_PAGES; i++ )
- {
- q = mfn_to_page(_mfn(mfn+i));
- page_list_add_tail(q, &p2m->pod.single);
- }
- }
-
- switch ( order )
- {
- case 9:
- BUG_ON( page_list_empty(&p2m->pod.super) );
- p = page_list_remove_head(&p2m->pod.super);
- p2m->pod.count -= 1 << order; /* Lock: page_alloc */
- break;
- case 0:
- BUG_ON( page_list_empty(&p2m->pod.single) );
- p = page_list_remove_head(&p2m->pod.single);
- p2m->pod.count -= 1;
- break;
- default:
- BUG();
- }
-
- /* Put the pages back on the domain page_list */
- for ( i = 0 ; i < (1 << order); i++ )
- {
- BUG_ON(page_get_owner(p + i) != p2m->domain);
- page_list_add_tail(p + i, &p2m->domain->page_list);
- }
-
- return p;
-}
-
-/* Set the size of the cache, allocating or freeing as necessary. */
-static int
-p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible)
-{
- struct domain *d = p2m->domain;
- int ret = 0;
-
- /* Increasing the target */
- while ( pod_target > p2m->pod.count )
- {
- struct page_info * page;
- int order;
-
- if ( (pod_target - p2m->pod.count) >= SUPERPAGE_PAGES )
- order = 9;
- else
- order = 0;
- retry:
- page = alloc_domheap_pages(d, order, 0);
- if ( unlikely(page == NULL) )
- {
- if ( order == 9 )
- {
- /* If we can't allocate a superpage, try singleton pages */
- order = 0;
- goto retry;
- }
-
- printk("%s: Unable to allocate domheap page for pod cache. target
%lu cachesize %d\n",
- __func__, pod_target, p2m->pod.count);
- ret = -ENOMEM;
- goto out;
- }
-
- p2m_pod_cache_add(p2m, page, order);
-
- if ( hypercall_preempt_check() && preemptible )
- {
- ret = -EAGAIN;
- goto out;
- }
- }
-
- /* Decreasing the target */
- /* We hold the p2m lock here, so we don't need to worry about
- * cache disappearing under our feet. */
- while ( pod_target < p2m->pod.count )
- {
- struct page_info * page;
- int order, i;
-
- /* Grab the lock before checking that pod.super is empty, or the last
- * entries may disappear before we grab the lock. */
- spin_lock(&d->page_alloc_lock);
-
- if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
- && !page_list_empty(&p2m->pod.super) )
- order = 9;
- else
- order = 0;
-
- page = p2m_pod_cache_get(p2m, order);
-
- ASSERT(page != NULL);
-
- spin_unlock(&d->page_alloc_lock);
-
- /* Then free them */
- for ( i = 0 ; i < (1 << order) ; i++ )
- {
- /* Copied from common/memory.c:guest_remove_page() */
- if ( unlikely(!get_page(page+i, d)) )
- {
- gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
- ret = -EINVAL;
- goto out;
- }
-
- if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
- put_page_and_type(page+i);
-
- if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
- put_page(page+i);
-
- put_page(page+i);
-
- if ( hypercall_preempt_check() && preemptible )
- {
- ret = -EAGAIN;
- goto out;
- }
- }
- }
-
-out:
- return ret;
-}
-
-/*
- * The "right behavior" here requires some careful thought. First, some
- * definitions:
- * + M: static_max
- * + B: number of pages the balloon driver has ballooned down to.
- * + P: Number of populated pages.
- * + T: Old target
- * + T': New target
- *
- * The following equations should hold:
- * 0 <= P <= T <= B <= M
- * d->arch.p2m->pod.entry_count == B - P
- * d->tot_pages == P + d->arch.p2m->pod.count
- *
- * Now we have the following potential cases to cover:
- * B <T': Set the PoD cache size equal to the number of outstanding PoD
- * entries. The balloon driver will deflate the balloon to give back
- * the remainder of the ram to the guest OS.
- * T <T'<B : Increase PoD cache size.
- * T'<T<=B : Here we have a choice. We can decrease the size of the cache,
- * get the memory right away. However, that means every time we
- * reduce the memory target we risk the guest attempting to populate the
- * memory before the balloon driver has reached its new target. Safer to
- * never reduce the cache size here, but only when the balloon driver frees
- * PoD ranges.
- *
- * If there are many zero pages, we could reach the target also by doing
- * zero sweeps and marking the ranges PoD; but the balloon driver will have
- * to free this memory eventually anyway, so we don't actually gain that much
- * by doing so.
- *
- * NB that the equation (B<T') may require adjustment to the cache
- * size as PoD pages are freed as well; i.e., freeing a PoD-backed
- * entry when pod.entry_count == pod.count requires us to reduce both
- * pod.entry_count and pod.count.
- */
-int
-p2m_pod_set_mem_target(struct domain *d, unsigned long target)
-{
- unsigned pod_target;
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
- int ret = 0;
- unsigned long populated;
-
- p2m_lock(p2m);
-
- /* P == B: Nothing to do. */
- if ( p2m->pod.entry_count == 0 )
- goto out;
-
- /* Don't do anything if the domain is being torn down */
- if ( d->is_dying )
- goto out;
-
- /* T' < B: Don't reduce the cache size; let the balloon driver
- * take care of it. */
- if ( target < d->tot_pages )
- goto out;
-
- populated = d->tot_pages - p2m->pod.count;
-
- pod_target = target - populated;
-
- /* B < T': Set the cache size equal to # of outstanding entries,
- * let the balloon driver fill in the rest. */
- if ( pod_target > p2m->pod.entry_count )
- pod_target = p2m->pod.entry_count;
-
- ASSERT( pod_target >= p2m->pod.count );
-
- ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
-
-out:
- p2m_unlock(p2m);
-
- return ret;
-}
-
-void
-p2m_pod_empty_cache(struct domain *d)
-{
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
- struct page_info *page;
-
- /* After this barrier no new PoD activities can happen. */
- BUG_ON(!d->is_dying);
- spin_barrier(&p2m->lock);
-
- spin_lock(&d->page_alloc_lock);
-
- while ( (page = page_list_remove_head(&p2m->pod.super)) )
- {
- int i;
-
- for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ )
- {
- BUG_ON(page_get_owner(page + i) != d);
- page_list_add_tail(page + i, &d->page_list);
- }
-
- p2m->pod.count -= SUPERPAGE_PAGES;
- }
-
- while ( (page = page_list_remove_head(&p2m->pod.single)) )
- {
- BUG_ON(page_get_owner(page) != d);
- page_list_add_tail(page, &d->page_list);
-
- p2m->pod.count -= 1;
- }
-
- BUG_ON(p2m->pod.count != 0);
-
- spin_unlock(&d->page_alloc_lock);
-}
-
-int
-p2m_pod_offline_or_broken_hit(struct page_info *p)
-{
- struct domain *d;
- struct p2m_domain *p2m;
- struct page_info *q, *tmp;
- unsigned long mfn, bmfn;
-
- if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
- return 0;
-
- spin_lock(&d->page_alloc_lock);
- bmfn = mfn_x(page_to_mfn(p));
- page_list_for_each_safe(q, tmp, &p2m->pod.super)
- {
- mfn = mfn_x(page_to_mfn(q));
- if ( (bmfn >= mfn) && ((bmfn - mfn) < SUPERPAGE_PAGES) )
- {
- unsigned long i;
- page_list_del(q, &p2m->pod.super);
- for ( i = 0; i < SUPERPAGE_PAGES; i++)
- {
- q = mfn_to_page(_mfn(mfn + i));
- page_list_add_tail(q, &p2m->pod.single);
- }
- page_list_del(p, &p2m->pod.single);
- p2m->pod.count--;
- goto pod_hit;
- }
- }
-
- page_list_for_each_safe(q, tmp, &p2m->pod.single)
- {
- mfn = mfn_x(page_to_mfn(q));
- if ( mfn == bmfn )
- {
- page_list_del(p, &p2m->pod.single);
- p2m->pod.count--;
- goto pod_hit;
- }
- }
-
- spin_unlock(&d->page_alloc_lock);
- return 0;
-
-pod_hit:
- page_list_add_tail(p, &d->arch.relmem_list);
- spin_unlock(&d->page_alloc_lock);
- return 1;
-}
-
-void
-p2m_pod_offline_or_broken_replace(struct page_info *p)
-{
- struct domain *d;
- struct p2m_domain *p2m;
-
- if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
- return;
-
- free_domheap_page(p);
-
- p = alloc_domheap_page(d, 0);
- if ( unlikely(!p) )
- return;
-
- p2m_lock(p2m);
- p2m_pod_cache_add(p2m, p, 0);
- p2m_unlock(p2m);
- return;
-}
-
-/* This function is needed for two reasons:
- * + To properly handle clearing of PoD entries
- * + To "steal back" memory being freed for the PoD cache, rather than
- * releasing it.
- *
- * Once both of these functions have been completed, we can return and
- * allow decrease_reservation() to handle everything else.
- */
-int
-p2m_pod_decrease_reservation(struct domain *d,
- xen_pfn_t gpfn,
- unsigned int order)
-{
- int ret=0;
- int i;
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
- int steal_for_cache = 0;
- int pod = 0, nonpod = 0, ram = 0;
-
-
- /* If we don't have any outstanding PoD entries, let things take their
- * course */
- if ( p2m->pod.entry_count == 0 )
- goto out;
-
- /* Figure out if we need to steal some freed memory for our cache */
- steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );
-
- p2m_lock(p2m);
- audit_p2m(p2m, 1);
-
- if ( unlikely(d->is_dying) )
- goto out_unlock;
-
- /* See what's in here. */
- /* FIXME: Add contiguous; query for PSE entries? */
- for ( i=0; i<(1<<order); i++)
- {
- p2m_type_t t;
-
- gfn_to_mfn_query(p2m, gpfn + i, &t);
-
- if ( t == p2m_populate_on_demand )
- pod++;
- else
- {
- nonpod++;
- if ( p2m_is_ram(t) )
- ram++;
- }
- }
-
- /* No populate-on-demand? Don't need to steal anything? Then we're done!*/
- if(!pod && !steal_for_cache)
- goto out_unlock;
-
- if ( !nonpod )
- {
- /* All PoD: Mark the whole region invalid and tell caller
- * we're done. */
- set_p2m_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid, p2m->default_access);
- p2m->pod.entry_count-=(1<<order); /* Lock: p2m */
- BUG_ON(p2m->pod.entry_count < 0);
- ret = 1;
- goto out_entry_check;
- }
-
- /* FIXME: Steal contig 2-meg regions for cache */
-
- /* Process as long as:
- * + There are PoD entries to handle, or
- * + There is ram left, and we want to steal it
- */
- for ( i=0;
- i<(1<<order) && (pod>0 || (steal_for_cache && ram > 0));
- i++)
- {
- mfn_t mfn;
- p2m_type_t t;
-
- mfn = gfn_to_mfn_query(p2m, gpfn + i, &t);
- if ( t == p2m_populate_on_demand )
- {
- set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
- p2m->pod.entry_count--; /* Lock: p2m */
- BUG_ON(p2m->pod.entry_count < 0);
- pod--;
- }
- else if ( steal_for_cache && p2m_is_ram(t) )
- {
- struct page_info *page;
-
- ASSERT(mfn_valid(mfn));
-
- page = mfn_to_page(mfn);
-
- set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
- set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY);
-
- p2m_pod_cache_add(p2m, page, 0);
-
- steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );
-
- nonpod--;
- ram--;
- }
- }
-
- /* If there are no more non-PoD entries, tell decrease_reservation() that
- * there's nothing left to do. */
- if ( nonpod == 0 )
- ret = 1;
-
-out_entry_check:
- /* If we've reduced our "liabilities" beyond our "assets", free some */
- if ( p2m->pod.entry_count < p2m->pod.count )
- {
- p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/);
- }
-
-out_unlock:
- audit_p2m(p2m, 1);
- p2m_unlock(p2m);
-
-out:
- return ret;
-}
-
-void
-p2m_pod_dump_data(struct p2m_domain *p2m)
-{
- printk(" PoD entries=%d cachesize=%d\n",
- p2m->pod.entry_count, p2m->pod.count);
-}
-
-
-/* Search for all-zero superpages to be reclaimed as superpages for the
- * PoD cache. Must be called w/ p2m lock held, page_alloc lock not held. */
-static int
-p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
-{
- mfn_t mfn, mfn0 = _mfn(INVALID_MFN);
- p2m_type_t type, type0 = 0;
- unsigned long * map = NULL;
- int ret=0, reset = 0;
- int i, j;
- int max_ref = 1;
- struct domain *d = p2m->domain;
-
- if ( !superpage_aligned(gfn) )
- goto out;
-
- /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
- if ( paging_mode_shadow(d) )
- max_ref++;
-
- /* Look up the mfns, checking to make sure they're the same mfn
- * and aligned, and mapping them. */
- for ( i=0; i<SUPERPAGE_PAGES; i++ )
- {
-
- mfn = gfn_to_mfn_query(p2m, gfn + i, &type);
-
- if ( i == 0 )
- {
- mfn0 = mfn;
- type0 = type;
- }
-
- /* Conditions that must be met for superpage-superpage:
- * + All gfns are ram types
- * + All gfns have the same type
- * + All of the mfns are allocated to a domain
- * + None of the mfns are used as pagetables, or allocated via xenheap
- * + The first mfn is 2-meg aligned
- * + All the other mfns are in sequence
- * Adding for good measure:
- * + None of the mfns are likely to be mapped elsewhere (refcount
- * 2 or less for shadow, 1 for hap)
- */
- if ( !p2m_is_ram(type)
- || type != type0
- || ( (mfn_to_page(mfn)->count_info & PGC_allocated) == 0 )
- || ( (mfn_to_page(mfn)->count_info & (PGC_page_table|PGC_xen_heap)) != 0 )
- || ( (mfn_to_page(mfn)->count_info & PGC_xen_heap ) != 0 )
- || ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > max_ref )
- || !( ( i == 0 && superpage_aligned(mfn_x(mfn0)) )
- || ( i != 0 && mfn_x(mfn) == (mfn_x(mfn0) + i) ) ) )
- goto out;
- }
-
- /* Now, do a quick check to see if it may be zero before unmapping. */
- for ( i=0; i<SUPERPAGE_PAGES; i++ )
- {
- /* Quick zero-check */
- map = map_domain_page(mfn_x(mfn0) + i);
-
- for ( j=0; j<16; j++ )
- if( *(map+j) != 0 )
- break;
-
- unmap_domain_page(map);
-
- if ( j < 16 )
- goto out;
-
- }
-
- /* Try to remove the page, restoring old mapping if it fails. */
- set_p2m_entry(p2m, gfn,
- _mfn(POPULATE_ON_DEMAND_MFN), 9,
- p2m_populate_on_demand, p2m->default_access);
-
- /* Make none of the MFNs are used elsewhere... for example, mapped
- * via the grant table interface, or by qemu. Allow one refcount for
- * being allocated to the domain. */
- for ( i=0; i < SUPERPAGE_PAGES; i++ )
- {
- mfn = _mfn(mfn_x(mfn0) + i);
- if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 )
- {
- reset = 1;
- goto out_reset;
- }
- }
-
- /* Finally, do a full zero-check */
- for ( i=0; i < SUPERPAGE_PAGES; i++ )
- {
- map = map_domain_page(mfn_x(mfn0) + i);
-
- for ( j=0; j<PAGE_SIZE/sizeof(*map); j++ )
- if( *(map+j) != 0 )
- {
- reset = 1;
- break;
- }
-
- unmap_domain_page(map);
-
- if ( reset )
- goto out_reset;
- }
-
- if ( tb_init_done )
- {
- struct {
- u64 gfn, mfn;
- int d:16,order:16;
- } t;
-
- t.gfn = gfn;
- t.mfn = mfn_x(mfn);
- t.d = d->domain_id;
- t.order = 9;
-
- __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
- }
-
- /* Finally! We've passed all the checks, and can add the mfn superpage
- * back on the PoD cache, and account for the new p2m PoD entries */
- p2m_pod_cache_add(p2m, mfn_to_page(mfn0), 9);
- p2m->pod.entry_count += SUPERPAGE_PAGES;
-
-out_reset:
- if ( reset )
- set_p2m_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access);
-
-out:
- return ret;
-}
-
-static void
-p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count)
-{
- mfn_t mfns[count];
- p2m_type_t types[count];
- unsigned long * map[count];
- struct domain *d = p2m->domain;
-
- int i, j;
- int max_ref = 1;
-
- /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
- if ( paging_mode_shadow(d) )
- max_ref++;
-
- /* First, get the gfn list, translate to mfns, and map the pages. */
- for ( i=0; i<count; i++ )
- {
- mfns[i] = gfn_to_mfn_query(p2m, gfns[i], types + i);
- /* If this is ram, and not a pagetable or from the xen heap, and probably not mapped
- elsewhere, map it; otherwise, skip. */
- if ( p2m_is_ram(types[i])
- && ( (mfn_to_page(mfns[i])->count_info & PGC_allocated) != 0 )
- && ( (mfn_to_page(mfns[i])->count_info & (PGC_page_table|PGC_xen_heap)) == 0 )
- && ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) <= max_ref ) )
- map[i] = map_domain_page(mfn_x(mfns[i]));
- else
- map[i] = NULL;
- }
-
- /* Then, go through and check for zeroed pages, removing write permission
- * for those with zeroes. */
- for ( i=0; i<count; i++ )
- {
- if(!map[i])
- continue;
-
- /* Quick zero-check */
- for ( j=0; j<16; j++ )
- if( *(map[i]+j) != 0 )
- break;
-
- if ( j < 16 )
- {
- unmap_domain_page(map[i]);
- map[i] = NULL;
- continue;
- }
-
- /* Try to remove the page, restoring old mapping if it fails. */
- set_p2m_entry(p2m, gfns[i],
- _mfn(POPULATE_ON_DEMAND_MFN), 0,
- p2m_populate_on_demand, p2m->default_access);
-
- /* See if the page was successfully unmapped. (Allow one refcount
- * for being allocated to a domain.) */
- if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 )
- {
- unmap_domain_page(map[i]);
- map[i] = NULL;
-
- set_p2m_entry(p2m, gfns[i], mfns[i], 0, types[i], p2m->default_access);
-
- continue;
- }
- }
-
- /* Now check each page for real */
- for ( i=0; i < count; i++ )
- {
- if(!map[i])
- continue;
-
- for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
- if( *(map[i]+j) != 0 )
- break;
-
- unmap_domain_page(map[i]);
-
- /* See comment in p2m_pod_zero_check_superpage() re gnttab
- * check timing. */
- if ( j < PAGE_SIZE/sizeof(*map[i]) )
- {
- set_p2m_entry(p2m, gfns[i], mfns[i], 0, types[i], p2m->default_access);
- }
- else
- {
- if ( tb_init_done )
- {
- struct {
- u64 gfn, mfn;
- int d:16,order:16;
- } t;
-
- t.gfn = gfns[i];
- t.mfn = mfn_x(mfns[i]);
- t.d = d->domain_id;
- t.order = 0;
-
- __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
- }
-
- /* Add to cache, and account for the new p2m PoD entry */
- p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), 0);
- p2m->pod.entry_count++;
- }
- }
-
-}
-
-#define POD_SWEEP_LIMIT 1024
-static void
-p2m_pod_emergency_sweep_super(struct p2m_domain *p2m)
-{
- unsigned long i, start, limit;
-
- if ( p2m->pod.reclaim_super == 0 )
- {
- p2m->pod.reclaim_super = (p2m->pod.max_guest>>9)<<9;
- p2m->pod.reclaim_super -= SUPERPAGE_PAGES;
- }
-
- start = p2m->pod.reclaim_super;
- limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
-
- for ( i=p2m->pod.reclaim_super ; i > 0 ; i -= SUPERPAGE_PAGES )
- {
- p2m_pod_zero_check_superpage(p2m, i);
- /* Stop if we're past our limit and we have found *something*.
- *
- * NB that this is a zero-sum game; we're increasing our cache size
- * by increasing our 'debt'. Since we hold the p2m lock,
- * (entry_count - count) must remain the same. */
- if ( !page_list_empty(&p2m->pod.super) && i < limit )
- break;
- }
-
- p2m->pod.reclaim_super = i ? i - SUPERPAGE_PAGES : 0;
-}
-
-#define POD_SWEEP_STRIDE 16
-static void
-p2m_pod_emergency_sweep(struct p2m_domain *p2m)
-{
- unsigned long gfns[POD_SWEEP_STRIDE];
- unsigned long i, j=0, start, limit;
- p2m_type_t t;
-
-
- if ( p2m->pod.reclaim_single == 0 )
- p2m->pod.reclaim_single = p2m->pod.max_guest;
-
- start = p2m->pod.reclaim_single;
- limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
-
- /* FIXME: Figure out how to avoid superpages */
- for ( i=p2m->pod.reclaim_single; i > 0 ; i-- )
- {
- gfn_to_mfn_query(p2m, i, &t );
- if ( p2m_is_ram(t) )
- {
- gfns[j] = i;
- j++;
- BUG_ON(j > POD_SWEEP_STRIDE);
- if ( j == POD_SWEEP_STRIDE )
- {
- p2m_pod_zero_check(p2m, gfns, j);
- j = 0;
- }
- }
- /* Stop if we're past our limit and we have found *something*.
- *
- * NB that this is a zero-sum game; we're increasing our cache size
- * by re-increasing our 'debt'. Since we hold the p2m lock,
- * (entry_count - count) must remain the same. */
- if ( p2m->pod.count > 0 && i < limit )
- break;
- }
-
- if ( j )
- p2m_pod_zero_check(p2m, gfns, j);
-
- p2m->pod.reclaim_single = i ? i - 1 : i;
-
-}
-
-int
-p2m_pod_demand_populate(struct p2m_domain *p2m, unsigned long gfn,
- unsigned int order,
- p2m_query_t q)
-{
- struct domain *d = p2m->domain;
- struct page_info *p = NULL; /* Compiler warnings */
- unsigned long gfn_aligned;
- mfn_t mfn;
- int i;
-
- ASSERT(p2m_locked_by_me(p2m));
-
- /* This check is done with the p2m lock held. This will make sure that
- * even if d->is_dying changes under our feet, p2m_pod_empty_cache()
- * won't start until we're done. */
- if ( unlikely(d->is_dying) )
- goto out_fail;
-
- /* Because PoD does not have cache list for 1GB pages, it has to remap
- * 1GB region to 2MB chunks for a retry. */
- if ( order == 18 )
- {
- gfn_aligned = (gfn >> order) << order;
- /* Note that we are supposed to call set_p2m_entry() 512 times to
- * split 1GB into 512 2MB pages here. But We only do once here because
- * set_p2m_entry() should automatically shatter the 1GB page into
- * 512 2MB pages. The rest of 511 calls are unnecessary.
- */
- set_p2m_entry(p2m, gfn_aligned, _mfn(POPULATE_ON_DEMAND_MFN), 9,
- p2m_populate_on_demand, p2m->default_access);
- audit_p2m(p2m, 1);
- p2m_unlock(p2m);
- return 0;
- }
-
- /* Once we've ballooned down enough that we can fill the remaining
- * PoD entries from the cache, don't sweep even if the particular
- * list we want to use is empty: that can lead to thrashing zero pages
- * through the cache for no good reason. */
- if ( p2m->pod.entry_count > p2m->pod.count )
- {
-
- /* If we're low, start a sweep */
- if ( order == 9 && page_list_empty(&p2m->pod.super) )
- p2m_pod_emergency_sweep_super(p2m);
-
- if ( page_list_empty(&p2m->pod.single) &&
- ( ( order == 0 )
- || (order == 9 && page_list_empty(&p2m->pod.super) ) ) )
- p2m_pod_emergency_sweep(p2m);
- }
-
- /* Keep track of the highest gfn demand-populated by a guest fault */
- if ( q == p2m_guest && gfn > p2m->pod.max_guest )
- p2m->pod.max_guest = gfn;
-
- spin_lock(&d->page_alloc_lock);
-
- if ( p2m->pod.count == 0 )
- goto out_of_memory;
-
- /* Get a page f/ the cache. A NULL return value indicates that the
- * 2-meg range should be marked singleton PoD, and retried */
- if ( (p = p2m_pod_cache_get(p2m, order)) == NULL )
- goto remap_and_retry;
-
- mfn = page_to_mfn(p);
-
- BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
-
- spin_unlock(&d->page_alloc_lock);
-
- gfn_aligned = (gfn >> order) << order;
-
- set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, p2m->default_access);
-
- for( i = 0; i < (1UL << order); i++ )
- {
- set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i);
- paging_mark_dirty(d, mfn_x(mfn) + i);
- }
-
- p2m->pod.entry_count -= (1 << order); /* Lock: p2m */
- BUG_ON(p2m->pod.entry_count < 0);
-
- if ( tb_init_done )
- {
- struct {
- u64 gfn, mfn;
- int d:16,order:16;
- } t;
-
- t.gfn = gfn;
- t.mfn = mfn_x(mfn);
- t.d = d->domain_id;
- t.order = order;
-
- __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
- }
-
- return 0;
-out_of_memory:
- spin_unlock(&d->page_alloc_lock);
-
- printk("%s: Out of populate-on-demand memory! tot_pages %" PRIu32 "
pod_entries %" PRIi32 "\n",
- __func__, d->tot_pages, p2m->pod.entry_count);
- domain_crash(d);
-out_fail:
- return -1;
-remap_and_retry:
- BUG_ON(order != 9);
- spin_unlock(&d->page_alloc_lock);
-
- /* Remap this 2-meg region in singleton chunks */
- gfn_aligned = (gfn>>order)<<order;
- for(i=0; i<(1<<order); i++)
- set_p2m_entry(p2m, gfn_aligned+i, _mfn(POPULATE_ON_DEMAND_MFN), 0,
- p2m_populate_on_demand, p2m->default_access);
- if ( tb_init_done )
- {
- struct {
- u64 gfn;
- int d:16;
- } t;
-
- t.gfn = gfn;
- t.d = d->domain_id;
-
- __trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), &t);
- }
-
- return 0;
-}
-
-/* Non-ept "lock-and-check" wrapper */
-static int p2m_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn,
- l1_pgentry_t *p2m_entry, int order,
- p2m_query_t q)
-{
- /* Only take the lock if we don't already have it. Otherwise it
- * wouldn't be safe to do p2m lookups with the p2m lock held */
- int do_locking = !p2m_locked_by_me(p2m);
- int r;
-
- if ( do_locking )
- p2m_lock(p2m);
-
- audit_p2m(p2m, 1);
-
- /* Check to make sure this is still PoD */
- if ( p2m_flags_to_type(l1e_get_flags(*p2m_entry)) != p2m_populate_on_demand )
- {
- if ( do_locking )
- p2m_unlock(p2m);
- return 0;
- }
-
- r = p2m_pod_demand_populate(p2m, gfn, order, q);
-
- audit_p2m(p2m, 1);
- if ( do_locking )
- p2m_unlock(p2m);
-
- return r;
-}
-
-// Returns 0 on error (out of memory)
-static int
-p2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
- unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma)
-{
- // XXX -- this might be able to be faster iff current->domain == d
- mfn_t table_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
- void *table =map_domain_page(mfn_x(table_mfn));
- unsigned long i, gfn_remainder = gfn;
- l1_pgentry_t *p2m_entry;
- l1_pgentry_t entry_content;
- l2_pgentry_t l2e_content;
- l3_pgentry_t l3e_content;
- int rv=0;
- unsigned int iommu_pte_flags = (p2mt == p2m_ram_rw) ?
- IOMMUF_readable|IOMMUF_writable:
- 0;
- unsigned long old_mfn = 0;
-
- if ( tb_init_done )
- {
- struct {
- u64 gfn, mfn;
- int p2mt;
- int d:16,order:16;
- } t;
-
- t.gfn = gfn;
- t.mfn = mfn_x(mfn);
- t.p2mt = p2mt;
- t.d = p2m->domain->domain_id;
- t.order = page_order;
-
- __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t);
- }
-
-#if CONFIG_PAGING_LEVELS >= 4
- if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
- L4_PAGETABLE_SHIFT - PAGE_SHIFT,
- L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
- goto out;
-#endif
- /*
- * Try to allocate 1GB page table if this feature is supported.
- */
- if ( page_order == 18 )
- {
- l1_pgentry_t old_entry = l1e_empty();
- p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
- L3_PAGETABLE_SHIFT - PAGE_SHIFT,
- L3_PAGETABLE_ENTRIES);
- ASSERT(p2m_entry);
- if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
- !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
- {
- /* We're replacing a non-SP page with a superpage. Make sure to
- * handle freeing the table properly. */
- old_entry = *p2m_entry;
- }
-
- ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
- l3e_content = mfn_valid(mfn)
- ? l3e_from_pfn(mfn_x(mfn),
- p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
- : l3e_empty();
- entry_content.l1 = l3e_content.l3;
-
- if ( entry_content.l1 != 0 )
- {
- p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
- old_mfn = l1e_get_pfn(*p2m_entry);
- }
-
- p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3);
- /* NB: paging_write_p2m_entry() handles tlb flushes properly */
-
- /* Free old intermediate tables if necessary */
- if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
- p2m_free_entry(p2m, &old_entry, page_order);
- }
- /*
- * When using PAE Xen, we only allow 33 bits of pseudo-physical
- * address in translated guests (i.e. 8 GBytes). This restriction
- * comes from wanting to map the P2M table into the 16MB RO_MPT hole
- * in Xen's address space for translated PV guests.
- * When using AMD's NPT on PAE Xen, we are restricted to 4GB.
- */
- else if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
- L3_PAGETABLE_SHIFT - PAGE_SHIFT,
- ((CONFIG_PAGING_LEVELS == 3)
- ? (hap_enabled(p2m->domain) ? 4 : 8)
- : L3_PAGETABLE_ENTRIES),
- PGT_l2_page_table) )
- goto out;
-
- if ( page_order == 0 )
- {
- if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
- L2_PAGETABLE_SHIFT - PAGE_SHIFT,
- L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
- goto out;
-
- p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
- 0, L1_PAGETABLE_ENTRIES);
- ASSERT(p2m_entry);
-
- if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
- entry_content = l1e_from_pfn(mfn_x(mfn),
- p2m_type_to_flags(p2mt, mfn));
- else
- entry_content = l1e_empty();
-
- if ( entry_content.l1 != 0 )
- {
- p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
- old_mfn = l1e_get_pfn(*p2m_entry);
- }
- /* level 1 entry */
- p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
- /* NB: paging_write_p2m_entry() handles tlb flushes properly */
- }
- else if ( page_order == 9 )
- {
- l1_pgentry_t old_entry = l1e_empty();
- p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
- L2_PAGETABLE_SHIFT - PAGE_SHIFT,
- L2_PAGETABLE_ENTRIES);
- ASSERT(p2m_entry);
-
- /* FIXME: Deal with 4k replaced by 2meg pages */
- if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
- !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
- {
- /* We're replacing a non-SP page with a superpage. Make sure to
- * handle freeing the table properly. */
- old_entry = *p2m_entry;
- }
-
- ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
- if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
- l2e_content = l2e_from_pfn(mfn_x(mfn),
- p2m_type_to_flags(p2mt, mfn) |
- _PAGE_PSE);
- else
- l2e_content = l2e_empty();
-
- entry_content.l1 = l2e_content.l2;
-
- if ( entry_content.l1 != 0 )
- {
- p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
- old_mfn = l1e_get_pfn(*p2m_entry);
- }
-
- p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2);
- /* NB: paging_write_p2m_entry() handles tlb flushes properly */
-
- /* Free old intermediate tables if necessary */
- if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
- p2m_free_entry(p2m, &old_entry, page_order);
- }
-
- /* Track the highest gfn for which we have ever had a valid mapping */
- if ( mfn_valid(mfn)
- && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
- p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
-
- if ( iommu_enabled && need_iommu(p2m->domain) )
- {
- if ( iommu_hap_pt_share )
- {
- if ( old_mfn && (old_mfn != mfn_x(mfn)) )
- amd_iommu_flush_pages(p2m->domain, gfn, page_order);
- }
- else
- {
- if ( p2mt == p2m_ram_rw )
- for ( i = 0; i < (1UL << page_order); i++ )
- iommu_map_page(p2m->domain, gfn+i, mfn_x(mfn)+i,
- IOMMUF_readable|IOMMUF_writable);
- else
- for ( int i = 0; i < (1UL << page_order); i++ )
- iommu_unmap_page(p2m->domain, gfn+i);
- }
- }
-
- /* Success */
- rv = 1;
-
-out:
- unmap_domain_page(table);
- return rv;
-}
-
-static mfn_t
-p2m_gfn_to_mfn(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
- p2m_query_t q)
-{
- mfn_t mfn;
- paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
- l2_pgentry_t *l2e;
- l1_pgentry_t *l1e;
-
- ASSERT(paging_mode_translate(p2m->domain));
-
- /* XXX This is for compatibility with the old model, where anything not
- * XXX marked as RAM was considered to be emulated MMIO space.
- * XXX Once we start explicitly registering MMIO regions in the p2m
- * XXX we will return p2m_invalid for unmapped gfns */
- *t = p2m_mmio_dm;
- /* Not implemented except with EPT */
- *a = p2m_access_rwx;
-
- mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
-
- if ( gfn > p2m->max_mapped_pfn )
- /* This pfn is higher than the highest the p2m map currently holds */
- return _mfn(INVALID_MFN);
-
-#if CONFIG_PAGING_LEVELS >= 4
- {
- l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
- l4e += l4_table_offset(addr);
- if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
- {
- unmap_domain_page(l4e);
- return _mfn(INVALID_MFN);
- }
- mfn = _mfn(l4e_get_pfn(*l4e));
- unmap_domain_page(l4e);
- }
-#endif
- {
- l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
-#if CONFIG_PAGING_LEVELS == 3
- /* On PAE hosts the p2m has eight l3 entries, not four (see
- * shadow_set_p2m_entry()) so we can't use l3_table_offset.
- * Instead, just count the number of l3es from zero. It's safe
- * to do this because we already checked that the gfn is within
- * the bounds of the p2m. */
- l3e += (addr >> L3_PAGETABLE_SHIFT);
-#else
- l3e += l3_table_offset(addr);
-#endif
-pod_retry_l3:
- if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
- {
- if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand )
- {
- if ( q != p2m_query )
- {
- if ( !p2m_pod_demand_populate(p2m, gfn, 18, q) )
- goto pod_retry_l3;
- }
- else
- *t = p2m_populate_on_demand;
- }
- unmap_domain_page(l3e);
- return _mfn(INVALID_MFN);
- }
- else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
- {
- mfn = _mfn(l3e_get_pfn(*l3e) +
- l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
- l1_table_offset(addr));
- *t = p2m_flags_to_type(l3e_get_flags(*l3e));
- unmap_domain_page(l3e);
-
- ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
- return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
- }
-
- mfn = _mfn(l3e_get_pfn(*l3e));
- unmap_domain_page(l3e);
- }
-
- l2e = map_domain_page(mfn_x(mfn));
- l2e += l2_table_offset(addr);
-
-pod_retry_l2:
- if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
- {
- /* PoD: Try to populate a 2-meg chunk */
- if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
- {
- if ( q != p2m_query ) {
- if ( !p2m_pod_check_and_populate(p2m, gfn,
- (l1_pgentry_t *)l2e, 9, q) )
- goto pod_retry_l2;
- } else
- *t = p2m_populate_on_demand;
- }
-
- unmap_domain_page(l2e);
- return _mfn(INVALID_MFN);
- }
- else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
- {
- mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
- *t = p2m_flags_to_type(l2e_get_flags(*l2e));
- unmap_domain_page(l2e);
-
- ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
- return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
- }
-
- mfn = _mfn(l2e_get_pfn(*l2e));
- unmap_domain_page(l2e);
-
- l1e = map_domain_page(mfn_x(mfn));
- l1e += l1_table_offset(addr);
-pod_retry_l1:
- if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
- {
- /* PoD: Try to populate */
- if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand )
- {
- if ( q != p2m_query ) {
- if ( !p2m_pod_check_and_populate(p2m, gfn,
- (l1_pgentry_t *)l1e, 0, q) )
- goto pod_retry_l1;
- } else
- *t = p2m_populate_on_demand;
- }
-
- unmap_domain_page(l1e);
- return _mfn(INVALID_MFN);
- }
- mfn = _mfn(l1e_get_pfn(*l1e));
- *t = p2m_flags_to_type(l1e_get_flags(*l1e));
- unmap_domain_page(l1e);
-
- ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
- return (p2m_is_valid(*t) || p2m_is_grant(*t)) ? mfn : _mfn(INVALID_MFN);
-}
-
-/* Read the current domain's p2m table (through the linear mapping). */
-static mfn_t p2m_gfn_to_mfn_current(struct p2m_domain *p2m,
- unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
- p2m_query_t q)
-{
- mfn_t mfn = _mfn(INVALID_MFN);
- p2m_type_t p2mt = p2m_mmio_dm;
- paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
- /* XXX This is for compatibility with the old model, where anything not
- * XXX marked as RAM was considered to be emulated MMIO space.
- * XXX Once we start explicitly registering MMIO regions in the p2m
- * XXX we will return p2m_invalid for unmapped gfns */
-
- /* Not currently implemented except for EPT */
- *a = p2m_access_rwx;
-
- if ( gfn <= p2m->max_mapped_pfn )
- {
- l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
- l2_pgentry_t l2e = l2e_empty();
- int ret;
-#if CONFIG_PAGING_LEVELS >= 4
- l3_pgentry_t l3e = l3e_empty();
-#endif
-
- ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START)
- / sizeof(l1_pgentry_t));
-
-#if CONFIG_PAGING_LEVELS >= 4
- /*
- * Read & process L3
- */
- p2m_entry = (l1_pgentry_t *)
- &__linear_l2_table[l2_linear_offset(RO_MPT_VIRT_START)
- + l3_linear_offset(addr)];
- pod_retry_l3:
- ret = __copy_from_user(&l3e, p2m_entry, sizeof(l3e));
-
- if ( ret != 0 || !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
- {
- if ( (l3e_get_flags(l3e) & _PAGE_PSE) &&
- (p2m_flags_to_type(l3e_get_flags(l3e)) == p2m_populate_on_demand) )
- {
- /* The read has succeeded, so we know that mapping exists */
- if ( q != p2m_query )
- {
- if ( !p2m_pod_demand_populate(p2m, gfn, 18, q) )
- goto pod_retry_l3;
- p2mt = p2m_invalid;
- printk("%s: Allocate 1GB failed!\n", __func__);
- goto out;
- }
- else
- {
- p2mt = p2m_populate_on_demand;
- goto out;
- }
- }
- goto pod_retry_l2;
- }
-
- if ( l3e_get_flags(l3e) & _PAGE_PSE )
- {
- p2mt = p2m_flags_to_type(l3e_get_flags(l3e));
- ASSERT(l3e_get_pfn(l3e) != INVALID_MFN || !p2m_is_ram(p2mt));
- if (p2m_is_valid(p2mt) )
- mfn = _mfn(l3e_get_pfn(l3e) +
- l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
- l1_table_offset(addr));
- else
- p2mt = p2m_mmio_dm;
-
- goto out;
- }
-#endif
- /*
- * Read & process L2
- */
- p2m_entry = &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START)
- + l2_linear_offset(addr)];
-
- pod_retry_l2:
- ret = __copy_from_user(&l2e,
- p2m_entry,
- sizeof(l2e));
- if ( ret != 0
- || !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
- {
- if( (l2e_get_flags(l2e) & _PAGE_PSE)
- && ( p2m_flags_to_type(l2e_get_flags(l2e))
- == p2m_populate_on_demand ) )
- {
- /* The read has succeeded, so we know that the mapping
- * exits at this point. */
- if ( q != p2m_query )
- {
- if ( !p2m_pod_check_and_populate(p2m, gfn,
- p2m_entry, 9, q) )
- goto pod_retry_l2;
-
- /* Allocate failed. */
- p2mt = p2m_invalid;
- printk("%s: Allocate failed!\n", __func__);
- goto out;
- }
- else
- {
- p2mt = p2m_populate_on_demand;
- goto out;
- }
- }
-
- goto pod_retry_l1;
- }
-
- if (l2e_get_flags(l2e) & _PAGE_PSE)
- {
- p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
- ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
-
- if ( p2m_is_valid(p2mt) )
- mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
- else
- p2mt = p2m_mmio_dm;
-
- goto out;
- }
-
- /*
- * Read and process L1
- */
-
- /* Need to __copy_from_user because the p2m is sparse and this
- * part might not exist */
- pod_retry_l1:
- p2m_entry = &phys_to_machine_mapping[gfn];
-
- ret = __copy_from_user(&l1e,
- p2m_entry,
- sizeof(l1e));
-
- if ( ret == 0 ) {
- p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
- ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
-
- if ( p2m_flags_to_type(l1e_get_flags(l1e))
- == p2m_populate_on_demand )
- {
- /* The read has succeeded, so we know that the mapping
- * exits at this point. */
- if ( q != p2m_query )
- {
- if ( !p2m_pod_check_and_populate(p2m, gfn,
- (l1_pgentry_t *)p2m_entry, 0, q) )
- goto pod_retry_l1;
-
- /* Allocate failed. */
- p2mt = p2m_invalid;
- goto out;
- }
- else
- {
- p2mt = p2m_populate_on_demand;
- goto out;
- }
- }
-
- if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) )
- mfn = _mfn(l1e_get_pfn(l1e));
- else
- /* XXX see above */
- p2mt = p2m_mmio_dm;
- }
- }
-out:
- *t = p2mt;
- return mfn;
-}
+/* XXX declare functions moved to p2m-pt.c */
+extern void p2m_pt_init(struct p2m_domain *p2m);
/* Init the datastructures for later use by the p2m code */
static void p2m_initialise(struct domain *d, struct p2m_domain *p2m)
@@ -1930,15 +87,12 @@ static void p2m_initialise(struct domain
p2m->default_access = p2m_access_rwx;
p2m->cr3 = CR3_EADDR;
- p2m->set_entry = p2m_set_entry;
- p2m->get_entry = p2m_gfn_to_mfn;
- p2m->get_entry_current = p2m_gfn_to_mfn_current;
- p2m->change_entry_type_global = p2m_change_type_global;
- p2m->write_p2m_entry = paging_write_p2m_entry;
cpus_clear(p2m->p2m_dirty_cpumask);
if ( hap_enabled(d) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
ept_p2m_init(d);
+ else
+ p2m_pt_init(p2m);
return;
}
@@ -1986,7 +140,6 @@ void p2m_change_entry_type_global(struct
p2m_unlock(p2m);
}
-static
int set_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma)
{
@@ -2162,275 +315,6 @@ void p2m_final_teardown(struct domain *d
p2m_teardown_nestedp2m(d);
}
-#if P2M_AUDIT
-/* strict_m2p == 0 allows m2p mappings that don't match the p2m.
- * It's intended for add_to_physmap, when the domain has just been allocated
- * new mfns that might have stale m2p entries from previous owners */
-static void audit_p2m(struct p2m_domain *p2m, int strict_m2p)
-{
- struct page_info *page;
- struct domain *od;
- unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
- int entry_count = 0;
- mfn_t p2mfn;
- unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
- int test_linear;
- p2m_type_t type;
- struct domain *d = p2m->domain;
-
- if ( !paging_mode_translate(d) )
- return;
-
- //P2M_PRINTK("p2m audit starts\n");
-
- test_linear = ( (d == current->domain)
- && !pagetable_is_null(current->arch.monitor_table) );
- if ( test_linear )
- flush_tlb_local();
-
- spin_lock(&d->page_alloc_lock);
-
- /* Audit part one: walk the domain's page allocation list, checking
- * the m2p entries. */
- page_list_for_each ( page, &d->page_list )
- {
- mfn = mfn_x(page_to_mfn(page));
-
- // P2M_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
-
- od = page_get_owner(page);
-
- if ( od != d )
- {
- P2M_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
- mfn, od, (od?od->domain_id:-1), d, d->domain_id);
- continue;
- }
-
- gfn = get_gpfn_from_mfn(mfn);
- if ( gfn == INVALID_M2P_ENTRY )
- {
- orphans_i++;
- //P2M_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
- // mfn);
- continue;
- }
-
- if ( gfn == 0x55555555 || gfn == 0x5555555555555555 )
- {
- orphans_d++;
- //P2M_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
- // mfn);
- continue;
- }
-
- if ( gfn == SHARED_M2P_ENTRY )
- {
- P2M_PRINTK("shared mfn (%lx) on domain page list!\n",
- mfn);
- continue;
- }
-
- p2mfn = gfn_to_mfn_type_p2m(p2m, gfn, &type, p2m_query);
- if ( strict_m2p && mfn_x(p2mfn) != mfn )
- {
- mpbad++;
- P2M_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
- " (-> gfn %#lx)\n",
- mfn, gfn, mfn_x(p2mfn),
- (mfn_valid(p2mfn)
- ? get_gpfn_from_mfn(mfn_x(p2mfn))
- : -1u));
- /* This m2p entry is stale: the domain has another frame in
- * this physical slot. No great disaster, but for neatness,
- * blow away the m2p entry. */
- set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
- }
-
- if ( test_linear && (gfn <= p2m->max_mapped_pfn) )
- {
- lp2mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &type));
- if ( lp2mfn != mfn_x(p2mfn) )
- {
- P2M_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
- "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn));
- }
- }
-
- // P2M_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
- // mfn, gfn, mfn_x(p2mfn), lp2mfn);
- }
-
- spin_unlock(&d->page_alloc_lock);
-
- /* Audit part two: walk the domain's p2m table, checking the entries. */
- if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) != 0 )
- {
- l2_pgentry_t *l2e;
- l1_pgentry_t *l1e;
- int i1, i2;
-
-#if CONFIG_PAGING_LEVELS == 4
- l4_pgentry_t *l4e;
- l3_pgentry_t *l3e;
- int i4, i3;
- l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-#else /* CONFIG_PAGING_LEVELS == 3 */
- l3_pgentry_t *l3e;
- int i3;
- l3e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-#endif
-
- gfn = 0;
-#if CONFIG_PAGING_LEVELS >= 4
- for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
- {
- if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
- {
- gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
- continue;
- }
- l3e = map_domain_page(mfn_x(_mfn(l4e_get_pfn(l4e[i4]))));
-#endif
- for ( i3 = 0;
- i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
- i3++ )
- {
- if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
- {
- gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
- continue;
- }
-
- /* check for 1GB super page */
- if ( l3e_get_flags(l3e[i3]) & _PAGE_PSE )
- {
- mfn = l3e_get_pfn(l3e[i3]);
- ASSERT(mfn_valid(_mfn(mfn)));
- /* we have to cover 512x512 4K pages */
- for ( i2 = 0;
- i2 < (L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES);
- i2++)
- {
- m2pfn = get_gpfn_from_mfn(mfn+i2);
- if ( m2pfn != (gfn + i2) )
- {
- pmbad++;
- P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
- " -> gfn %#lx\n", gfn+i2, mfn+i2,
- m2pfn);
- BUG();
- }
- gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
- continue;
- }
- }
-
- l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3]))));
- for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
- {
- if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
- {
- if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE)
- && ( p2m_flags_to_type(l2e_get_flags(l2e[i2]))
- == p2m_populate_on_demand ) )
- entry_count+=SUPERPAGE_PAGES;
- gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
- continue;
- }
-
- /* check for super page */
- if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE )
- {
- mfn = l2e_get_pfn(l2e[i2]);
- ASSERT(mfn_valid(_mfn(mfn)));
- for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
- {
- m2pfn = get_gpfn_from_mfn(mfn+i1);
- /* Allow shared M2Ps */
- if ( (m2pfn != (gfn + i1)) &&
- (m2pfn != SHARED_M2P_ENTRY) )
- {
- pmbad++;
- P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
- " -> gfn %#lx\n", gfn+i1, mfn+i1,
- m2pfn);
- BUG();
- }
- }
- gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
- continue;
- }
-
- l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
-
- for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
- {
- p2m_type_t type;
-
- type = p2m_flags_to_type(l1e_get_flags(l1e[i1]));
- if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
- {
- if ( type == p2m_populate_on_demand )
- entry_count++;
- continue;
- }
- mfn = l1e_get_pfn(l1e[i1]);
- ASSERT(mfn_valid(_mfn(mfn)));
- m2pfn = get_gpfn_from_mfn(mfn);
- if ( m2pfn != gfn &&
- type != p2m_mmio_direct &&
- !p2m_is_grant(type) &&
- !p2m_is_shared(type) )
- {
- pmbad++;
- printk("mismatch: gfn %#lx -> mfn %#lx"
- " -> gfn %#lx\n", gfn, mfn, m2pfn);
- P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
- " -> gfn %#lx\n", gfn, mfn, m2pfn);
- BUG();
- }
- }
- unmap_domain_page(l1e);
- }
- unmap_domain_page(l2e);
- }
-#if CONFIG_PAGING_LEVELS >= 4
- unmap_domain_page(l3e);
- }
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
- unmap_domain_page(l4e);
-#else /* CONFIG_PAGING_LEVELS == 3 */
- unmap_domain_page(l3e);
-#endif
-
- }
-
- if ( entry_count != p2m->pod.entry_count )
- {
- printk("%s: refcounted entry count %d, audit count %d!\n",
- __func__,
- p2m->pod.entry_count,
- entry_count);
- BUG();
- }
-
- //P2M_PRINTK("p2m audit complete\n");
- //if ( orphans_i | orphans_d | mpbad | pmbad )
- // P2M_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
- // orphans_i + orphans_d, orphans_i, orphans_d);
- if ( mpbad | pmbad )
- {
- P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
- pmbad, mpbad);
- WARN();
- }
-}
-#endif /* P2M_AUDIT */
-
-
static void
p2m_remove_page(struct p2m_domain *p2m, unsigned long gfn, unsigned long mfn,
@@ -2475,88 +359,6 @@ guest_physmap_remove_entry(struct p2m_do
p2m_unlock(p2m);
}
-#if CONFIG_PAGING_LEVELS == 3
-static int gfn_check_limit(
- struct domain *d, unsigned long gfn, unsigned int order)
-{
- /*
- * 32bit AMD nested paging does not support over 4GB guest due to
- * hardware translation limit. This limitation is checked by comparing
- * gfn with 0xfffffUL.
- */
- if ( !hap_enabled(d) || ((gfn + (1ul << order)) <= 0x100000UL) ||
- (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
- return 0;
-
- if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
- dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
- " 4GB: specify 'hap=0' domain config option.\n",
- d->domain_id);
-
- return -EINVAL;
-}
-#else
-#define gfn_check_limit(d, g, o) 0
-#endif
-
-int
-guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
- unsigned int order)
-{
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
- unsigned long i;
- p2m_type_t ot;
- mfn_t omfn;
- int pod_count = 0;
- int rc = 0;
-
- BUG_ON(!paging_mode_translate(d));
-
- rc = gfn_check_limit(d, gfn, order);
- if ( rc != 0 )
- return rc;
-
- p2m_lock(p2m);
- audit_p2m(p2m, 1);
-
- P2M_DEBUG("mark pod gfn=%#lx\n", gfn);
-
- /* Make sure all gpfns are unused */
- for ( i = 0; i < (1UL << order); i++ )
- {
- omfn = gfn_to_mfn_query(p2m, gfn + i, &ot);
- if ( p2m_is_ram(ot) )
- {
- printk("%s: gfn_to_mfn returned type %d!\n",
- __func__, ot);
- rc = -EBUSY;
- goto out;
- }
- else if ( ot == p2m_populate_on_demand )
- {
- /* Count how man PoD entries we'll be replacing if successful */
- pod_count++;
- }
- }
-
- /* Now, actually do the two-way mapping */
- if ( !set_p2m_entry(p2m, gfn, _mfn(POPULATE_ON_DEMAND_MFN), order,
- p2m_populate_on_demand, p2m->default_access) )
- rc = -EINVAL;
- else
- {
- p2m->pod.entry_count += 1 << order; /* Lock: p2m */
- p2m->pod.entry_count -= pod_count;
- BUG_ON(p2m->pod.entry_count < 0);
- }
-
- audit_p2m(p2m, 1);
- p2m_unlock(p2m);
-
-out:
- return rc;
-}
-
int
guest_physmap_add_entry(struct p2m_domain *p2m, unsigned long gfn,
unsigned long mfn, unsigned int page_order,
@@ -2588,7 +390,7 @@ guest_physmap_add_entry(struct p2m_domai
return 0;
}
- rc = gfn_check_limit(d, gfn, page_order);
+ rc = p2m_gfn_check_limit(d, gfn, page_order);
if ( rc != 0 )
return rc;
@@ -2682,142 +484,6 @@ guest_physmap_add_entry(struct p2m_domai
return rc;
}
-/* Walk the whole p2m table, changing any entries of the old type
- * to the new type. This is used in hardware-assisted paging to
- * quickly enable or diable log-dirty tracking */
-void p2m_change_type_global(struct p2m_domain *p2m, p2m_type_t ot, p2m_type_t nt)
-{
- unsigned long mfn, gfn, flags;
- l1_pgentry_t l1e_content;
- l1_pgentry_t *l1e;
- l2_pgentry_t *l2e;
- mfn_t l1mfn, l2mfn, l3mfn;
- unsigned long i1, i2, i3;
- l3_pgentry_t *l3e;
-#if CONFIG_PAGING_LEVELS == 4
- l4_pgentry_t *l4e;
- unsigned long i4;
-#endif /* CONFIG_PAGING_LEVELS == 4 */
-
- BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
- BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
-
- if ( !paging_mode_translate(p2m->domain) )
- return;
-
- if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) == 0 )
- return;
-
- ASSERT(p2m_locked_by_me(p2m));
-
-#if CONFIG_PAGING_LEVELS == 4
- l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-#else /* CONFIG_PAGING_LEVELS == 3 */
- l3mfn = _mfn(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
- l3e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
- for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
- {
- if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
- {
- continue;
- }
- l3mfn = _mfn(l4e_get_pfn(l4e[i4]));
- l3e = map_domain_page(l4e_get_pfn(l4e[i4]));
-#endif
- for ( i3 = 0;
- i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
- i3++ )
- {
- if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
- {
- continue;
- }
- if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) )
- {
- flags = l3e_get_flags(l3e[i3]);
- if ( p2m_flags_to_type(flags) != ot )
- continue;
- mfn = l3e_get_pfn(l3e[i3]);
- gfn = get_gpfn_from_mfn(mfn);
- flags = p2m_type_to_flags(nt, _mfn(mfn));
- l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
- p2m->write_p2m_entry(p2m, gfn,
- (l1_pgentry_t *)&l3e[i3],
- l3mfn, l1e_content, 3);
- continue;
- }
-
- l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
- l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
- for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
- {
- if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
- {
- continue;
- }
-
- if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
- {
- flags = l2e_get_flags(l2e[i2]);
- if ( p2m_flags_to_type(flags) != ot )
- continue;
- mfn = l2e_get_pfn(l2e[i2]);
- /* Do not use get_gpfn_from_mfn because it may return SHARED_M2P_ENTRY */
- gfn = (i2 + (i3
-#if CONFIG_PAGING_LEVELS >= 4
- + (i4 * L3_PAGETABLE_ENTRIES)
-#endif
- )
- * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
- flags = p2m_type_to_flags(nt, _mfn(mfn));
- l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
- p2m->write_p2m_entry(p2m, gfn,
- (l1_pgentry_t *)&l2e[i2],
- l2mfn, l1e_content, 2);
- continue;
- }
-
- l1mfn = _mfn(l2e_get_pfn(l2e[i2]));
- l1e = map_domain_page(mfn_x(l1mfn));
-
- for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
- {
- flags = l1e_get_flags(l1e[i1]);
- if ( p2m_flags_to_type(flags) != ot )
- continue;
- mfn = l1e_get_pfn(l1e[i1]);
- gfn = i1 + (i2 + (i3
-#if CONFIG_PAGING_LEVELS >= 4
- + (i4 * L3_PAGETABLE_ENTRIES)
-#endif
- )
- * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
- /* create a new l1e entry with the new type */
- flags = p2m_type_to_flags(nt, _mfn(mfn));
- l1e_content = l1e_from_pfn(mfn, flags);
- p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
- l1mfn, l1e_content, 1);
- }
- unmap_domain_page(l1e);
- }
- unmap_domain_page(l2e);
- }
-#if CONFIG_PAGING_LEVELS >= 4
- unmap_domain_page(l3e);
- }
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
- unmap_domain_page(l4e);
-#else /* CONFIG_PAGING_LEVELS == 3 */
- unmap_domain_page(l3e);
-#endif
-
-}
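
The walk removed above recovers the gfn for each entry from the per-level
table indices rather than via get_gpfn_from_mfn. A standalone sketch of that
index arithmetic on a 4-level table with 512 entries per level, not part of
the patch:

/* Standalone illustration of gfn reconstruction from indices; not Xen code. */
#include <assert.h>

#define PT_ENTRIES 512UL   /* entries per level on 64-bit builds */

static unsigned long gfn_from_indices(unsigned long i4, unsigned long i3,
                                      unsigned long i2, unsigned long i1)
{
    /* gfn = ((i4*512 + i3)*512 + i2)*512 + i1 */
    return ((i4 * PT_ENTRIES + i3) * PT_ENTRIES + i2) * PT_ENTRIES + i1;
}

int main(void)
{
    /* The first L1 entry of the second L2 slot maps gfn 512. */
    assert(gfn_from_indices(0, 0, 1, 0) == 512);
    /* One full L3 slot (1GB of 4KiB frames) spans 512*512 gfns. */
    assert(gfn_from_indices(0, 1, 0, 0) == 512 * 512);
    return 0;
}
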
/* Modify the p2m type of a single gfn from ot to nt, returning the
* entry's previous type. Resets the access permissions. */
diff -r 4b0692880dfa -r 26c4beb6b520 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h Thu May 05 17:40:34 2011 +0100
+++ b/xen/include/asm-x86/p2m.h Fri May 06 11:15:35 2011 +0100
@@ -638,6 +638,34 @@ static inline void p2m_mem_access_check(
struct page_info *p2m_alloc_ptp(struct p2m_domain *p2m, unsigned long type);
void p2m_free_ptp(struct p2m_domain *p2m, struct page_info *pg);
+#if CONFIG_PAGING_LEVELS == 3
+static inline int p2m_gfn_check_limit(
+ struct domain *d, unsigned long gfn, unsigned int order)
+{
+ /*
+ * 32bit AMD nested paging does not support guests above 4GB because of a
+ * hardware translation limit. This is checked by comparing the end of the
+ * range, gfn + (1 << order), against 0x100000UL (the 4GB boundary).
+ */
+ if ( !hap_enabled(d) || ((gfn + (1ul << order)) <= 0x100000UL) ||
+ (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
+ return 0;
+
+ if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
+ dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
+ " 4GB: specify 'hap=0' domain config option.\n",
+ d->domain_id);
+
+ return -EINVAL;
+}
+#else
+#define p2m_gfn_check_limit(d, g, o) 0
+#endif
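
This hunk exposes the former gfn_check_limit to other p2m files as
p2m_gfn_check_limit, using the usual inline-or-macro-stub pattern so that
callers compile unchanged on either configuration. A simplified standalone
sketch of that pattern, not part of the patch, which keeps only the range
check and drops the hap/vendor tests and the one-shot warning:

/* Standalone illustration of the inline-or-stub pattern; not Xen code. */
#include <errno.h>
#include <stdio.h>

#define HAS_4GB_LIMIT 1   /* stand-in for CONFIG_PAGING_LEVELS == 3 */

#if HAS_4GB_LIMIT
static inline int check_limit(unsigned long gfn, unsigned int order)
{
    return (gfn + (1UL << order)) <= 0x100000UL ? 0 : -EINVAL;
}
#else
/* On builds without the limit the check collapses to a constant. */
#define check_limit(gfn, order) 0
#endif

int main(void)
{
    /* Callers are written identically whichever variant is compiled in. */
    printf("%d\n", check_limit(0x200000UL, 0));   /* -EINVAL here */
    return 0;
}
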
+
+/* Directly set a p2m entry: only for use by p2m code */
+int set_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
+ unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma);
+
#endif /* _XEN_P2M_H */
/*