[Xen-changelog] [xen-unstable] Implement Nested-on-Nested.

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] Implement Nested-on-Nested.
From: Xen patchbot-unstable <patchbot@xxxxxxx>
Date: Sat, 09 Apr 2011 09:20:16 +0100
Delivery-date: Sat, 09 Apr 2011 01:23:32 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User cegger
# Date 1302011049 -7200
# Node ID 7714b42e72fad771a447d66dc9e2acdd0dc98c59
# Parent  9c3fbfa7d0d5ce94c764e126f158c2b6fc78fb28
Implement Nested-on-Nested.
This allows the guest to run a nested guest with HAP enabled.

Signed-off-by: Christoph Egger <Christoph.Egger@xxxxxxx>
Acked-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Committed-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---
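A minimal, standalone sketch of the translation flow this change introduces for nested HAP: an L2 guest-physical address is first translated through the L1 guest's nested page table (h_cr3), then through the L0 host p2m, and the resulting mapping is written into the nested p2m. The flat arrays and helper names below are illustrative stand-ins, not Xen code; only the three outcomes mirror the NESTEDHVM_PAGEFAULT_* values used by the patch.

    /* Sketch only: flat arrays stand in for the L1 nested page table and
     * the L0 host p2m; names are hypothetical, not Xen APIs. */
    #include <stdio.h>
    #include <stdint.h>

    enum npf_result { PAGEFAULT_DONE, PAGEFAULT_INJECT, PAGEFAULT_ERROR };

    #define NPAGES  4
    #define INVALID UINT64_MAX

    static uint64_t l1_nested_pt[NPAGES] = { 2, INVALID, 0, 1 }; /* L2 gfn -> L1 gfn */
    static uint64_t l0_host_p2m[NPAGES]  = { 7, 8, 9, INVALID }; /* L1 gfn -> mfn    */

    static enum npf_result nested_page_fault(uint64_t l2_gfn, uint64_t *mfn)
    {
        uint64_t l1_gfn;

        /* Step 1: walk the L1 guest's nested page table. */
        if (l2_gfn >= NPAGES || l1_nested_pt[l2_gfn] == INVALID)
            return PAGEFAULT_INJECT;        /* let the L1 guest fix its table */
        l1_gfn = l1_nested_pt[l2_gfn];

        /* Step 2: walk the L0 host p2m. */
        if (l1_gfn >= NPAGES || l0_host_p2m[l1_gfn] == INVALID)
            return PAGEFAULT_ERROR;         /* L0 cannot map it */
        *mfn = l0_host_p2m[l1_gfn];

        /* Step 3: the real code would now install the L2 gfn -> mfn entry
         * in the nested p2m and resume the L2 guest. */
        return PAGEFAULT_DONE;
    }

    int main(void)
    {
        for (uint64_t gfn = 0; gfn < NPAGES; gfn++) {
            uint64_t mfn = INVALID;
            enum npf_result r = nested_page_fault(gfn, &mfn);
            printf("L2 gfn %llu -> %s (mfn %lld)\n",
                   (unsigned long long)gfn,
                   r == PAGEFAULT_DONE ? "done" :
                   r == PAGEFAULT_INJECT ? "inject #NPF into L1" : "error",
                   r == PAGEFAULT_DONE ? (long long)mfn : -1LL);
        }
        return 0;
    }

In the patch itself this flow is implemented by nestedhap_walk_L1_p2m(), nestedhap_walk_L0_p2m() and nestedhap_fix_p2m() in xen/arch/x86/mm/hap/nested_hap.c.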


diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/hvm/hvm.c    Tue Apr 05 15:44:09 2011 +0200
@@ -1186,21 +1186,50 @@
     hvm_funcs.inject_exception(trapnr, errcode, cr2);
 }
 
-bool_t hvm_hap_nested_page_fault(unsigned long gpa,
-                                 bool_t gla_valid,
-                                 unsigned long gla,
-                                 bool_t access_valid,
-                                 bool_t access_r,
-                                 bool_t access_w,
-                                 bool_t access_x)
+int hvm_hap_nested_page_fault(unsigned long gpa,
+                              bool_t gla_valid,
+                              unsigned long gla,
+                              bool_t access_valid,
+                              bool_t access_r,
+                              bool_t access_w,
+                              bool_t access_x)
 {
     unsigned long gfn = gpa >> PAGE_SHIFT;
     p2m_type_t p2mt;
     p2m_access_t p2ma;
     mfn_t mfn;
     struct vcpu *v = current;
-    struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
-
+    struct p2m_domain *p2m = NULL;
+
+    /* On Nested Virtualization, walk the guest page table.
+     * If this succeeds, all is fine.
+     * If this fails, inject a nested page fault into the guest.
+     */
+    if ( nestedhvm_enabled(v->domain)
+        && nestedhvm_vcpu_in_guestmode(v)
+        && nestedhvm_paging_mode_hap(v) )
+    {
+        int rv;
+
+        /* The vcpu is in guest mode and the l1 guest
+         * uses hap. That means 'gpa' is in l2 guest
+         * physical address space.
+         * Fix the nested p2m or inject nested page fault
+         * into l1 guest if not fixable. The algorithm is
+         * the same as for shadow paging.
+         */
+        rv = nestedhvm_hap_nested_page_fault(v, gpa);
+        switch (rv) {
+        case NESTEDHVM_PAGEFAULT_DONE:
+            return 1;
+        case NESTEDHVM_PAGEFAULT_ERROR:
+            return 0;
+        case NESTEDHVM_PAGEFAULT_INJECT:
+            return -1;
+        }
+    }
+
+    p2m = p2m_get_hostp2m(v->domain);
     mfn = gfn_to_mfn_type_current(p2m, gfn, &p2mt, &p2ma, p2m_guest);
 
     /* Check access permissions first, then handle faults */
@@ -1344,6 +1373,15 @@
         return X86EMUL_EXCEPTION;
     }
 
+    if ( nestedhvm_enabled(v->domain) && cpu_has_svm &&
+       ((value & EFER_SVME) == 0 ) &&
+       ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_SVME) )
+    {
+        /* Cleared EFER.SVME: Flush all nestedp2m tables */
+        p2m_flush_nestedp2m(v->domain);
+        nestedhvm_vcpu_reset(v);
+    }
+
     value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
     v->arch.hvm_vcpu.guest_efer = value;
     hvm_update_guest_efer(v);
@@ -1494,8 +1532,12 @@
     v->arch.hvm_vcpu.guest_cr[0] = value;
     hvm_update_guest_cr(v, 0);
 
-    if ( (value ^ old_value) & X86_CR0_PG )
-        paging_update_paging_modes(v);
+    if ( (value ^ old_value) & X86_CR0_PG ) {
+        if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
+            paging_update_nestedmode(v);
+        else
+            paging_update_paging_modes(v);
+    }
 
     return X86EMUL_OKAY;
 
@@ -1562,8 +1604,12 @@
     hvm_update_guest_cr(v, 4);
 
     /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
-    if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
-        paging_update_paging_modes(v);
+    if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) {
+        if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
+            paging_update_nestedmode(v);
+        else
+            paging_update_paging_modes(v);
+    }
 
     return X86EMUL_OKAY;
 
@@ -2076,7 +2122,7 @@
     void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec)
 {
     struct vcpu *curr = current;
-    struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+    struct p2m_domain *p2m;
     unsigned long gfn, mfn;
     p2m_type_t p2mt;
     char *p;
@@ -2098,6 +2144,8 @@
         return HVMCOPY_unhandleable;
 #endif
 
+    p2m = p2m_get_hostp2m(curr->domain);
+
     while ( todo > 0 )
     {
         count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/hvm/nestedhvm.c
--- a/xen/arch/x86/hvm/nestedhvm.c      Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/hvm/nestedhvm.c      Tue Apr 05 15:44:09 2011 +0200
@@ -20,6 +20,7 @@
 #include <asm/msr.h>
 #include <asm/hvm/support.h>   /* for HVM_DELIVER_NO_ERROR_CODE */
 #include <asm/hvm/hvm.h>
+#include <asm/p2m.h>    /* for struct p2m_domain */
 #include <asm/hvm/nestedhvm.h>
 #include <asm/event.h>  /* for local_event_delivery_(en|dis)able */
 #include <asm/paging.h> /* for paging_mode_hap() */
@@ -96,6 +97,54 @@
     return nhvm_vcpu_destroy(v);
 }
 
+static void
+nestedhvm_flushtlb_ipi(void *info)
+{
+    struct vcpu *v = current;
+    struct domain *d = info;
+
+    ASSERT(d != NULL);
+    if (v->domain != d) {
+        /* This cpu doesn't belong to the domain */
+        return;
+    }
+
+    /* Just flush the ASID (or request a new one).
+     * This is cheaper than flush_tlb_local() and has
+     * the same desired effect.
+     */
+    hvm_asid_flush_core();
+    vcpu_nestedhvm(v).nv_p2m = NULL;
+}
+
+void
+nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m)
+{
+    on_selected_cpus(&p2m->p2m_dirty_cpumask, nestedhvm_flushtlb_ipi,
+        p2m->domain, 1);
+    cpus_clear(p2m->p2m_dirty_cpumask);
+}
+
+void
+nestedhvm_vmcx_flushtlbdomain(struct domain *d)
+{
+    on_selected_cpus(d->domain_dirty_cpumask, nestedhvm_flushtlb_ipi, d, 1);
+}
+
+bool_t
+nestedhvm_is_n2(struct vcpu *v)
+{
+    if (!nestedhvm_enabled(v->domain)
+      || nestedhvm_vmswitch_in_progress(v)
+      || !nestedhvm_paging_mode_hap(v))
+        return 0;
+
+    if (nestedhvm_vcpu_in_guestmode(v))
+        return 1;
+
+    return 0;
+}
+
 /* Common shadow IO Permission bitmap */
 
 /* There four global patterns of io bitmap each guest can
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/hvm/svm/nestedsvm.c
--- a/xen/arch/x86/hvm/svm/nestedsvm.c  Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c  Tue Apr 05 15:44:09 2011 +0200
@@ -26,6 +26,7 @@
 #include <asm/hvm/svm/svmdebug.h>
 #include <asm/paging.h> /* paging_mode_hap */
 #include <asm/event.h> /* for local_event_delivery_(en|dis)able */
+#include <asm/p2m.h> /* p2m_get_pagetable, p2m_get_nestedp2m */
 
 static void
 nestedsvm_vcpu_clgi(struct vcpu *v)
@@ -320,6 +321,18 @@
     return 0;
 }
 
+static void nestedsvm_vmcb_set_nestedp2m(struct vcpu *v,
+    struct vmcb_struct *vvmcb, struct vmcb_struct *n2vmcb)
+{
+    struct p2m_domain *p2m;
+
+    ASSERT(v != NULL);
+    ASSERT(vvmcb != NULL);
+    ASSERT(n2vmcb != NULL);
+    p2m = p2m_get_nestedp2m(v, vvmcb->_h_cr3);
+    n2vmcb->_h_cr3 = pagetable_get_paddr(p2m_get_pagetable(p2m));
+}
+
 static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
 {
     struct nestedvcpu *nv = &vcpu_nestedhvm(v);
@@ -475,6 +488,9 @@
     /* Nested paging mode */
     if (nestedhvm_paging_mode_hap(v)) {
         /* host nested paging + guest nested paging. */
+        n2vmcb->_np_enable = 1;
+
+        nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);
 
         /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
         rc = hvm_set_cr3(ns_vmcb->_cr3);
@@ -1318,8 +1334,20 @@
         ret = nsvm_vcpu_vmrun(v, regs);
         if (ret < 0)
             goto vmexit;
+
+        ASSERT(nestedhvm_vcpu_in_guestmode(v));
         nv->nv_vmentry_pending = 0;
-        return;
+    }
+
+    if (nestedhvm_vcpu_in_guestmode(v)
+       && nestedhvm_paging_mode_hap(v))
+    {
+        /* In case we left the l2 guest due to a physical interrupt (e.g. IPI)
+         * that is not for the l1 guest, we continue running the l2 guest
+         * but check if the nestedp2m is still valid.
+         */
+        if (nv->nv_p2m == NULL)
+            nestedsvm_vmcb_set_nestedp2m(v, nv->nv_vvmcx, nv->nv_n2vmcx);
     }
 }
 
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c        Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/hvm/svm/svm.c        Tue Apr 05 15:44:09 2011 +0200
@@ -1014,14 +1014,16 @@
     return &svm_function_table;
 }
 
-static void svm_do_nested_pgfault(paddr_t gpa)
+static void svm_do_nested_pgfault(struct vcpu *v,
+    struct cpu_user_regs *regs, paddr_t gpa)
 {
+    int ret;
     unsigned long gfn = gpa >> PAGE_SHIFT;
     mfn_t mfn;
     p2m_type_t p2mt;
-    struct p2m_domain *p2m;
+    struct p2m_domain *p2m = NULL;
 
-    p2m = p2m_get_hostp2m(current->domain);
+    ret = hvm_hap_nested_page_fault(gpa, 0, ~0ul, 0, 0, 0, 0);
 
     if ( tb_init_done )
     {
@@ -1032,6 +1034,7 @@
             uint32_t p2mt;
         } _d;
 
+        p2m = p2m_get_p2m(v);
         _d.gpa = gpa;
         _d.qualification = 0;
         _d.mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &_d.p2mt));
@@ -1039,14 +1042,26 @@
         __trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d);
     }
 
-    if ( hvm_hap_nested_page_fault(gpa, 0, ~0ul, 0, 0, 0, 0) )
+    switch (ret) {
+    case 0:
+        break;
+    case 1:
         return;
+    case -1:
+        ASSERT(nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v));
+        /* inject #VMEXIT(NPF) into guest. */
+        nestedsvm_vmexit_defer(v, VMEXIT_NPF, regs->error_code, gpa);
+        return;
+    }
 
+    if ( p2m == NULL )
+        p2m = p2m_get_p2m(v);
     /* Everything else is an error. */
     mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
-    gdprintk(XENLOG_ERR, "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
-             gpa, mfn_x(mfn), p2mt);
-    domain_crash(current->domain);
+    gdprintk(XENLOG_ERR,
+         "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
+         gpa, mfn_x(mfn), p2mt);
+    domain_crash(v->domain);
 }
 
 static void svm_fpu_dirty_intercept(void)
@@ -1659,6 +1674,8 @@
         struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
         uint64_t exitinfo1, exitinfo2;
 
+        paging_update_nestedmode(v);
+
         /* Write real exitinfo1 back into virtual vmcb.
          * nestedsvm_check_intercepts() expects to have the correct
          * exitinfo1 value there.
@@ -1948,7 +1965,7 @@
     case VMEXIT_NPF:
         perfc_incra(svmexits, VMEXIT_NPF_PERFC);
         regs->error_code = vmcb->exitinfo1;
-        svm_do_nested_pgfault(vmcb->exitinfo2);
+        svm_do_nested_pgfault(v, regs, vmcb->exitinfo2);
         break;
 
     case VMEXIT_IRET: {
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/hap/Makefile
--- a/xen/arch/x86/mm/hap/Makefile      Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/hap/Makefile      Tue Apr 05 15:44:09 2011 +0200
@@ -3,6 +3,7 @@
 obj-y += guest_walk_3level.o
 obj-y += guest_walk_4level.o
 obj-y += p2m-ept.o
+obj-y += nested_hap.o
 
 guest_levels  = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1)))))
 guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1))
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/hap/guest_walk.c
--- a/xen/arch/x86/mm/hap/guest_walk.c  Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/hap/guest_walk.c  Tue Apr 05 15:44:09 2011 +0200
@@ -29,24 +29,32 @@
 #define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##_levels
 #define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels)
 
+#define _hap_p2m_ga_to_gfn(levels) hap_p2m_ga_to_gfn_##levels##_levels
+#define hap_p2m_ga_to_gfn(levels) _hap_p2m_ga_to_gfn(levels)
+
 #if GUEST_PAGING_LEVELS <= CONFIG_PAGING_LEVELS
 
 #include <asm/guest_pt.h>
 #include <asm/p2m.h>
 
 unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
-    struct vcpu *v, unsigned long gva, uint32_t *pfec)
+    struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
 {
-    unsigned long cr3;
+    unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3];
+    return hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(v, p2m, cr3, gva, pfec);
+}
+
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+    struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec)
+{
     uint32_t missing;
     mfn_t top_mfn;
     void *top_map;
     p2m_type_t p2mt;
     walk_t gw;
-    struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
 
     /* Get the top-level table's MFN */
-    cr3 = v->arch.hvm_vcpu.guest_cr[3];
     top_mfn = gfn_to_mfn_unshare(p2m, cr3 >> PAGE_SHIFT, &p2mt, 0);
     if ( p2m_is_paging(p2mt) )
     {
@@ -72,7 +80,7 @@
 #if GUEST_PAGING_LEVELS == 3
     top_map += (cr3 & ~(PAGE_MASK | 31));
 #endif
-    missing = guest_walk_tables(v, p2m, gva, &gw, pfec[0], top_mfn, top_map);
+    missing = guest_walk_tables(v, p2m, ga, &gw, pfec[0], top_mfn, top_map);
     unmap_domain_page(top_map);
 
     /* Interpret the answer */
@@ -122,6 +130,15 @@
     return INVALID_GFN;
 }
 
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+    struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec)
+{
+    gdprintk(XENLOG_ERR,
+             "Guest paging level is greater than host paging level!\n");
+    domain_crash(v->domain);
+    return INVALID_GFN;
+}
 #endif
 
 
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/hap/hap.c
--- a/xen/arch/x86/mm/hap/hap.c Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/hap/hap.c Tue Apr 05 15:44:09 2011 +0200
@@ -40,6 +40,7 @@
 #include <asm/p2m.h>
 #include <asm/domain.h>
 #include <xen/numa.h>
+#include <asm/hvm/nestedhvm.h>
 
 #include "private.h"
 
@@ -582,6 +583,7 @@
 int hap_enable(struct domain *d, u32 mode)
 {
     unsigned int old_pages;
+    uint8_t i;
     int rv = 0;
 
     domain_pause(d);
@@ -620,6 +622,12 @@
             goto out;
     }
 
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        rv = p2m_alloc_table(d->arch.nested_p2m[i]);
+        if ( rv != 0 )
+           goto out;
+    }
+
     /* Now let other users see the new mode */
     d->arch.paging.mode = mode | PG_HAP_enable;
 
@@ -630,6 +638,13 @@
 
 void hap_final_teardown(struct domain *d)
 {
+    uint8_t i;
+
+    /* Destroy nestedp2m's first */
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        p2m_teardown(d->arch.nested_p2m[i]);
+    }
+
     if ( d->arch.paging.hap.total_pages != 0 )
         hap_teardown(d);
 
@@ -657,7 +672,7 @@
         /* release the monitor table held by each vcpu */
         for_each_vcpu ( d, v )
         {
-            if ( v->arch.paging.mode && paging_mode_external(d) )
+            if ( paging_get_hostmode(v) && paging_mode_external(d) )
             {
                 mfn = pagetable_get_mfn(v->arch.monitor_table);
                 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
@@ -725,6 +740,7 @@
 void hap_vcpu_init(struct vcpu *v)
 {
     v->arch.paging.mode = &hap_paging_real_mode;
+    v->arch.paging.nestedmode = &hap_paging_real_mode;
 }
 
 /************************************************/
@@ -751,6 +767,15 @@
  */
 static int hap_invlpg(struct vcpu *v, unsigned long va)
 {
+    if (nestedhvm_enabled(v->domain)) {
+        /* Emulate INVLPGA:
+         * We must perform the flush right now, otherwise another vcpu
+         * may use the stale nested p2m at the next VMRUN emulation.
+         */
+        p2m_flush(v, vcpu_nestedhvm(v).nv_p2m);
+        return 1;
+    }
+
     HAP_ERROR("Intercepted a guest INVLPG (%u:%u) with HAP enabled.\n",
               v->domain->domain_id, v->vcpu_id);
     domain_crash(v->domain);
@@ -763,17 +788,22 @@
     hvm_update_guest_cr(v, 3);
 }
 
+const struct paging_mode *
+hap_paging_get_mode(struct vcpu *v)
+{
+    return !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
+        hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
+        hvm_pae_enabled(v)       ? &hap_paging_pae_mode  :
+                                   &hap_paging_protected_mode;
+}
+
 static void hap_update_paging_modes(struct vcpu *v)
 {
     struct domain *d = v->domain;
 
     hap_lock(d);
 
-    v->arch.paging.mode =
-        !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
-        hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
-        hvm_pae_enabled(v)       ? &hap_paging_pae_mode  :
-                                   &hap_paging_protected_mode;
+    v->arch.paging.mode = hap_paging_get_mode(v);
 
     if ( pagetable_is_null(v->arch.monitor_table) )
     {
@@ -834,38 +864,65 @@
 hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p,
                     mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
 {
+    struct domain *d = v->domain;
     uint32_t old_flags;
+    bool_t flush_nestedp2m = 0;
 
-    hap_lock(v->domain);
+    /* We always use the host p2m here, regardless of whether the vcpu
+     * is in host or guest mode. The vcpu can be in guest mode when this
+     * is reached via a hypercall, which passes a domain and typically
+     * picks the first vcpu. */
 
+    hap_lock(d);
     old_flags = l1e_get_flags(*p);
+
+    if ( nestedhvm_enabled(d) && (old_flags & _PAGE_PRESENT) ) {
+        /* We are replacing a valid entry so we need to flush nested p2ms,
+         * unless the only change is an increase in access rights. */
+        mfn_t omfn = _mfn(l1e_get_pfn(*p));
+        mfn_t nmfn = _mfn(l1e_get_pfn(new));
+        flush_nestedp2m = !( mfn_x(omfn) == mfn_x(nmfn)
+            && perms_strictly_increased(old_flags, l1e_get_flags(new)) );
+    }
+
     safe_write_pte(p, new);
     if ( (old_flags & _PAGE_PRESENT)
          && (level == 1 || (level == 2 && (old_flags & _PAGE_PSE))) )
-             flush_tlb_mask(v->domain->domain_dirty_cpumask);
+             flush_tlb_mask(d->domain_dirty_cpumask);
 
 #if CONFIG_PAGING_LEVELS == 3
     /* install P2M in monitor table for PAE Xen */
     if ( level == 3 )
         /* We have written to the p2m l3: need to sync the per-vcpu
          * copies of it in the monitor tables */
-        p2m_install_entry_in_monitors(v->domain, (l3_pgentry_t *)p);
+        p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
 #endif
 
-    hap_unlock(v->domain);
+    hap_unlock(d);
+
+    if ( flush_nestedp2m )
+        p2m_flush_nestedp2m(d);
 }
 
 static unsigned long hap_gva_to_gfn_real_mode(
-    struct vcpu *v, unsigned long gva, uint32_t *pfec)
+    struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
 {
     return ((paddr_t)gva >> PAGE_SHIFT);
 }
 
+static unsigned long hap_p2m_ga_to_gfn_real_mode(
+    struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec)
+{
+    return (ga >> PAGE_SHIFT);
+}
+
 /* Entry points into this mode of the hap code. */
 static const struct paging_mode hap_paging_real_mode = {
     .page_fault             = hap_page_fault,
     .invlpg                 = hap_invlpg,
     .gva_to_gfn             = hap_gva_to_gfn_real_mode,
+    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_real_mode,
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
@@ -876,6 +933,7 @@
     .page_fault             = hap_page_fault,
     .invlpg                 = hap_invlpg,
     .gva_to_gfn             = hap_gva_to_gfn_2_levels,
+    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_2_levels,
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
@@ -886,6 +944,7 @@
     .page_fault             = hap_page_fault,
     .invlpg                 = hap_invlpg,
     .gva_to_gfn             = hap_gva_to_gfn_3_levels,
+    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_3_levels,
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
@@ -896,6 +955,7 @@
     .page_fault             = hap_page_fault,
     .invlpg                 = hap_invlpg,
     .gva_to_gfn             = hap_gva_to_gfn_4_levels,
+    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_4_levels,
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/hap/nested_hap.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/hap/nested_hap.c  Tue Apr 05 15:44:09 2011 +0200
@@ -0,0 +1,236 @@
+/******************************************************************************
+ * arch/x86/mm/hap/nested_hap.c
+ *
+ * Code for Nested Virtualization
+ * Copyright (c) 2011 Advanced Micro Devices
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <asm/domain.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/mem_event.h>
+#include <public/mem_event.h>
+#include <asm/mem_sharing.h>
+#include <xen/event.h>
+#include <asm/hap.h>
+#include <asm/hvm/support.h>
+
+#include <asm/hvm/nestedhvm.h>
+
+#include "private.h"
+
+/* ALGORITHM for NESTED PAGE FAULT
+ * 
+ * NOTATION
+ * Levels: L0, L1, L2
+ * Guests: L1 guest, L2 guest
+ * Hypervisor: L0 hypervisor
+ * Addresses: L2-GVA, L2-GPA, L1-GVA, L1-GPA, MPA
+ *
+ * On L0, when #NPF happens, the handler function should do:
+ * hap_page_fault(GPA)
+ * {
+ *    1. If #NPF is from L1 guest, then we crash the guest VM (same as old 
+ *       code)
+ *    2. If #NPF is from L2 guest, then we continue from (3)
+ *    3. Get h_cr3 from L1 guest. Map h_cr3 into L0 hypervisor address space.
+ *    4. Walk the h_cr3 page table
+ *    5.    - if not present, then we inject #NPF back to L1 guest and 
+ *            re-launch L1 guest (L1 guest will either treat this #NPF as MMIO,
+ *            or fix its p2m table for L2 guest)
+ *    6.    - if present, then we will get a new translated value L1-GPA
+ *            (points to L1 machine memory)
+ *    7.        * Use L1-GPA to walk L0 P2M table
+ *    8.            - if not present, then crash the guest (should not happen)
+ *    9.            - if present, then we get a new translated value MPA 
+ *                    (points to real machine memory)
+ *   10.                * Finally, use GPA and MPA to walk nested_p2m 
+ *                        and fix the bits.
+ * }
+ * 
+ */
+
+
+/********************************************/
+/*        NESTED VIRT P2M FUNCTIONS         */
+/********************************************/
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+void
+nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+    l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
+{
+    struct domain *d = p2m->domain;
+    uint32_t old_flags;
+
+    hap_lock(d);
+
+    old_flags = l1e_get_flags(*p);
+    safe_write_pte(p, new);
+    if (old_flags & _PAGE_PRESENT)
+        nestedhvm_vmcx_flushtlb(p2m);
+    
+    hap_unlock(d);
+}
+
+/********************************************/
+/*          NESTED VIRT FUNCTIONS           */
+/********************************************/
+static void
+nestedhap_fix_p2m(struct p2m_domain *p2m, paddr_t L2_gpa, paddr_t L0_gpa,
+    p2m_type_t p2mt, p2m_access_t p2ma)
+{
+    int rv;
+    ASSERT(p2m);
+    ASSERT(p2m->set_entry);
+
+    rv = p2m->set_entry(p2m, L2_gpa >> PAGE_SHIFT,
+                         page_to_mfn(maddr_to_page(L0_gpa)),
+                         0 /*4K*/, p2mt, p2ma);
+    if (rv == 0) {
+        gdprintk(XENLOG_ERR,
+               "failed to set entry for 0x%"PRIx64" -> 0x%"PRIx64"\n",
+               L2_gpa, L0_gpa);
+        BUG();
+    }
+}
+
+/* This function uses L1_gpa to walk the P2M table in L0 hypervisor. If the
+ * walk is successful, the translated value is returned in L0_gpa. The return 
+ * value tells the upper level what to do.
+ */
+static int
+nestedhap_walk_L0_p2m(struct p2m_domain *p2m, paddr_t L1_gpa, paddr_t *L0_gpa)
+{
+    mfn_t mfn;
+    p2m_type_t p2mt;
+
+    /* we use gfn_to_mfn_query() function to walk L0 P2M table */
+    mfn = gfn_to_mfn_query(p2m, L1_gpa >> PAGE_SHIFT, &p2mt);
+
+    if ( p2m_is_paging(p2mt) || p2m_is_shared(p2mt) || !p2m_is_ram(p2mt) )
+        return NESTEDHVM_PAGEFAULT_ERROR;
+
+    if ( !mfn_valid(mfn) )
+        return NESTEDHVM_PAGEFAULT_ERROR;
+
+    *L0_gpa = (mfn_x(mfn) << PAGE_SHIFT) + (L1_gpa & ~PAGE_MASK);
+    return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/* This function uses L2_gpa to walk the P2M page table in L1. If the 
+ * walk is successful, the translated value is returned in
+ * L1_gpa. The result value tells what to do next.
+ */
+static int
+nestedhap_walk_L1_p2m(struct vcpu *v, struct p2m_domain *p2m,
+    paddr_t L2_gpa, paddr_t *L1_gpa)
+{
+    uint32_t pfec;
+    unsigned long nested_cr3, gfn;
+    const struct paging_mode *mode = paging_get_hostmode(v);
+    
+    nested_cr3 = nhvm_vcpu_hostcr3(v);
+
+    /* walk the guest table */
+    gfn = paging_p2m_ga_to_gfn(v, p2m, mode, nested_cr3, L2_gpa, &pfec);
+
+    if ( gfn == INVALID_GFN ) 
+        return NESTEDHVM_PAGEFAULT_INJECT;
+
+    *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);
+    return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/*
+ * The following function, nestedhvm_hap_nested_page_fault(), covers steps
+ * (3)--(10) of the algorithm above. It returns NESTEDHVM_PAGEFAULT_DONE,
+ * NESTEDHVM_PAGEFAULT_INJECT or NESTEDHVM_PAGEFAULT_ERROR.
+ */
+int
+nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t L2_gpa)
+{
+    int rv;
+    paddr_t L1_gpa, L0_gpa;
+    struct domain *d = v->domain;
+    struct p2m_domain *p2m, *nested_p2m;
+
+    p2m = p2m_get_hostp2m(d); /* L0 p2m */
+    nested_p2m = p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v));
+
+    /* walk the L1 P2M table; note we have to pass p2m
+     * and not nested_p2m here, otherwise the walk will
+     * fail forever. */
+    rv = nestedhap_walk_L1_p2m(v, p2m, L2_gpa, &L1_gpa);
+
+    /* let the caller handle these two cases */
+    switch (rv) {
+    case NESTEDHVM_PAGEFAULT_INJECT:
+        return rv;
+    case NESTEDHVM_PAGEFAULT_ERROR:
+        return rv;
+    case NESTEDHVM_PAGEFAULT_DONE:
+        break;
+    default:
+        BUG();
+        break;
+    }
+
+    /* ==> we have to walk L0 P2M */
+    rv = nestedhap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa);
+
+    /* let the upper level caller handle these two cases */
+    switch (rv) {
+    case NESTEDHVM_PAGEFAULT_INJECT:
+        return rv;
+    case NESTEDHVM_PAGEFAULT_ERROR:
+        return rv;
+    case NESTEDHVM_PAGEFAULT_DONE:
+        break;
+    default:
+        BUG();
+        break;
+    }
+
+    nestedp2m_lock(d);
+    /* fix p2m_get_pagetable(nested_p2m) */
+    nestedhap_fix_p2m(nested_p2m, L2_gpa, L0_gpa,
+        p2m_ram_rw,
+        p2m_access_rwx /* FIXME: Should use same permission as l1 guest */);
+    nestedp2m_unlock(d);
+
+    return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/********************************************/
+/*     NESTED VIRT INITIALIZATION FUNCS     */
+/********************************************/
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/hap/private.h
--- a/xen/arch/x86/mm/hap/private.h     Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/hap/private.h     Tue Apr 05 15:44:09 2011 +0200
@@ -23,11 +23,27 @@
 /********************************************/
 /*          GUEST TRANSLATION FUNCS         */
 /********************************************/
-unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v, unsigned long gva, 
+unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v,
+                                     struct p2m_domain *p2m,
+                                     unsigned long gva, 
                                      uint32_t *pfec);
-unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v,
+                                     struct p2m_domain *p2m,
+                                     unsigned long gva, 
                                      uint32_t *pfec);
-unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v,
+                                     struct p2m_domain *p2m,
+                                     unsigned long gva, 
                                      uint32_t *pfec);
 
+unsigned long hap_p2m_ga_to_gfn_2_levels(struct vcpu *v,
+    struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_3_levels(struct vcpu *v,
+    struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v,
+    struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec);
+
 #endif /* __HAP_PRIVATE_H__ */
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c     Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/p2m.c     Tue Apr 05 15:44:09 2011 +0200
@@ -34,6 +34,7 @@
 #include <public/mem_event.h>
 #include <asm/mem_sharing.h>
 #include <xen/event.h>
+#include <asm/hvm/nestedhvm.h>
 
 /* Debugging and auditing of the P2M code? */
 #define P2M_AUDIT     0
@@ -75,7 +76,7 @@
 #define SUPERPAGE_PAGES (1UL << 9)
 #define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
 
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
 {
     unsigned long flags;
 #ifdef __x86_64__
@@ -121,9 +122,9 @@
 // Find the next level's P2M entry, checking for out-of-range gfn's...
 // Returns NULL on error.
 //
-static l1_pgentry_t *
+l1_pgentry_t *
 p2m_find_entry(void *table, unsigned long *gfn_remainder,
-                   unsigned long gfn, u32 shift, u32 max)
+                   unsigned long gfn, uint32_t shift, uint32_t max)
 {
     u32 index;
 
@@ -224,20 +225,17 @@
 
         switch ( type ) {
         case PGT_l3_page_table:
-            paging_write_p2m_entry(p2m->domain, gfn,
-                                   p2m_entry, *table_mfn, new_entry, 4);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4);
             break;
         case PGT_l2_page_table:
 #if CONFIG_PAGING_LEVELS == 3
             /* for PAE mode, PDPE only has PCD/PWT/P bits available */
             new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
 #endif
-            paging_write_p2m_entry(p2m->domain, gfn,
-                                   p2m_entry, *table_mfn, new_entry, 3);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
             break;
         case PGT_l1_page_table:
-            paging_write_p2m_entry(p2m->domain, gfn,
-                                   p2m_entry, *table_mfn, new_entry, 2);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2);
             break;
         default:
             BUG();
@@ -264,14 +262,13 @@
         for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
         {
             new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
-            paging_write_p2m_entry(p2m->domain, gfn,
-                                   l1_entry+i, *table_mfn, new_entry, 2);
+            p2m->write_p2m_entry(p2m, gfn,
+                l1_entry+i, *table_mfn, new_entry, 2);
         }
         unmap_domain_page(l1_entry);
         new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
                                  __PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
-        paging_write_p2m_entry(p2m->domain, gfn,
-                               p2m_entry, *table_mfn, new_entry, 3);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
     }
 
 
@@ -298,15 +295,15 @@
         for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
         {
             new_entry = l1e_from_pfn(pfn + i, flags);
-            paging_write_p2m_entry(p2m->domain, gfn,
-                                   l1_entry+i, *table_mfn, new_entry, 1);
+            p2m->write_p2m_entry(p2m, gfn,
+                l1_entry+i, *table_mfn, new_entry, 1);
         }
         unmap_domain_page(l1_entry);
         
         new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
                                  __PAGE_HYPERVISOR|_PAGE_USER);
-        paging_write_p2m_entry(p2m->domain, gfn,
-                               p2m_entry, *table_mfn, new_entry, 2);
+        p2m->write_p2m_entry(p2m, gfn,
+            p2m_entry, *table_mfn, new_entry, 2);
     }
 
     *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
@@ -1369,8 +1366,7 @@
                            p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
             : l3e_empty();
         entry_content.l1 = l3e_content.l3;
-        paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
-                               table_mfn, entry_content, 3);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3);
         /* NB: paging_write_p2m_entry() handles tlb flushes properly */
 
         /* Free old intermediate tables if necessary */
@@ -1410,8 +1406,7 @@
             entry_content = l1e_empty();
         
         /* level 1 entry */
-        paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
-                               table_mfn, entry_content, 1);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
         /* NB: paging_write_p2m_entry() handles tlb flushes properly */
     }
     else if ( page_order == 9 )
@@ -1440,8 +1435,7 @@
             l2e_content = l2e_empty();
         
         entry_content.l1 = l2e_content.l2;
-        paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
-                               table_mfn, entry_content, 2);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2);
         /* NB: paging_write_p2m_entry() handles tlb flushes properly */
 
         /* Free old intermediate tables if necessary */
@@ -1806,10 +1800,13 @@
     p2m->domain = d;
     p2m->default_access = p2m_access_rwx;
 
+    p2m->cr3 = CR3_EADDR;
     p2m->set_entry = p2m_set_entry;
     p2m->get_entry = p2m_gfn_to_mfn;
     p2m->get_entry_current = p2m_gfn_to_mfn_current;
     p2m->change_entry_type_global = p2m_change_type_global;
+    p2m->write_p2m_entry = paging_write_p2m_entry;
+    cpus_clear(p2m->p2m_dirty_cpumask);
 
     if ( hap_enabled(d) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
         ept_p2m_init(d);
@@ -1817,6 +1814,25 @@
     return;
 }
 
+static int
+p2m_init_nestedp2m(struct domain *d)
+{
+    uint8_t i;
+    struct p2m_domain *p2m;
+
+    nestedp2m_lock_init(d);
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        d->arch.nested_p2m[i] = p2m = xmalloc(struct p2m_domain);
+        if (p2m == NULL)
+            return -ENOMEM;
+        p2m_initialise(d, p2m);
+        p2m->get_entry_current = p2m->get_entry;
+        p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
+    }
+
+    return 0;
+}
+
 int p2m_init(struct domain *d)
 {
     struct p2m_domain *p2m;
@@ -1825,8 +1841,12 @@
     if ( p2m == NULL )
         return -ENOMEM;
     p2m_initialise(d, p2m);
-    
-    return 0;
+
+    /* Must initialise nestedp2m unconditionally
+     * since nestedhvm_enabled(d) returns false here.
+     * (p2m_init runs too early for HVM_PARAM_* options)
+     */
+    return p2m_init_nestedp2m(d);
 }
 
 void p2m_change_entry_type_global(struct p2m_domain *p2m,
@@ -1919,6 +1939,9 @@
                         p2m_invalid, p2m->default_access) )
         goto error;
 
+    if (p2m_is_nestedp2m(p2m))
+        goto nesteddone;
+
     /* Copy all existing mappings from the page list and m2p */
     spin_lock(&p2m->domain->page_alloc_lock);
     page_list_for_each(page, &p2m->domain->page_list)
@@ -1940,6 +1963,7 @@
     }
     spin_unlock(&p2m->domain->page_alloc_lock);
 
+ nesteddone:
     P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
     p2m_unlock(p2m);
     return 0;
@@ -1966,6 +1990,9 @@
     mfn_t mfn;
 #endif
 
+    if (p2m == NULL)
+        return;
+
     p2m_lock(p2m);
 
 #ifdef __x86_64__
@@ -1984,11 +2011,26 @@
     p2m_unlock(p2m);
 }
 
+static void p2m_teardown_nestedp2m(struct domain *d)
+{
+    uint8_t i;
+
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        xfree(d->arch.nested_p2m[i]);
+        d->arch.nested_p2m[i] = NULL;
+    }
+}
+
 void p2m_final_teardown(struct domain *d)
 {
     /* Iterate over all p2m tables per domain */
     xfree(d->arch.p2m);
     d->arch.p2m = NULL;
+
+    /* We must tear these down unconditionally because
+     * we initialise them unconditionally.
+     */
+    p2m_teardown_nestedp2m(d);
 }
 
 #if P2M_AUDIT
@@ -2573,9 +2615,9 @@
                 gfn = get_gpfn_from_mfn(mfn);
                 flags = p2m_type_to_flags(nt, _mfn(mfn));
                 l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
-                paging_write_p2m_entry(p2m->domain, gfn,
-                                       (l1_pgentry_t *)&l3e[i3],
-                                       l3mfn, l1e_content, 3);
+                p2m->write_p2m_entry(p2m, gfn,
+                                     (l1_pgentry_t *)&l3e[i3],
+                                     l3mfn, l1e_content, 3);
                 continue;
             }
 
@@ -2604,9 +2646,9 @@
                            * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
                     flags = p2m_type_to_flags(nt, _mfn(mfn));
                     l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
-                    paging_write_p2m_entry(p2m->domain, gfn,
-                                           (l1_pgentry_t *)&l2e[i2],
-                                           l2mfn, l1e_content, 2);
+                    p2m->write_p2m_entry(p2m, gfn,
+                                         (l1_pgentry_t *)&l2e[i2],
+                                         l2mfn, l1e_content, 2);
                     continue;
                 }
 
@@ -2628,8 +2670,8 @@
                     /* create a new 1le entry with the new type */
                     flags = p2m_type_to_flags(nt, _mfn(mfn));
                     l1e_content = l1e_from_pfn(mfn, flags);
-                    paging_write_p2m_entry(p2m->domain, gfn, &l1e[i1],
-                                           l1mfn, l1e_content, 1);
+                    p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
+                                         l1mfn, l1e_content, 1);
                 }
                 unmap_domain_page(l1e);
             }
@@ -3048,6 +3090,182 @@
 }
 #endif /* __x86_64__ */
 
+static struct p2m_domain *
+p2m_getlru_nestedp2m(struct domain *d, struct p2m_domain *p2m)
+{
+    int i, lru_index = -1;
+    struct p2m_domain *lrup2m, *tmp;
+
+    if (p2m == NULL) {
+        lru_index = MAX_NESTEDP2M - 1;
+        lrup2m = d->arch.nested_p2m[lru_index];
+    } else {
+        lrup2m = p2m;
+        for (i = 0; i < MAX_NESTEDP2M; i++) {
+            if (d->arch.nested_p2m[i] == p2m) {
+                lru_index = i;
+                break;
+            }
+        }
+    }
+
+    ASSERT(lru_index >= 0);
+    if (lru_index == 0) {
+        return lrup2m;
+    }
+
+    /* move the others down the array "list" */
+    for (i = lru_index - 1; i >= 0; i--) {
+        tmp = d->arch.nested_p2m[i];
+        d->arch.nested_p2m[i+1] = tmp;        
+    }
+
+    /* make the entry the first one */
+    d->arch.nested_p2m[0] = lrup2m;
+
+    return lrup2m;
+}
+
+static int 
+p2m_flush_locked(struct p2m_domain *p2m)
+{
+    ASSERT(p2m);
+    if (p2m->cr3 == CR3_EADDR)
+        /* Microoptimisation: p2m is already empty.
+         * => about 0.3% speedup of overall system performance.
+         */
+        return 0;
+
+    p2m_teardown(p2m);
+    p2m_initialise(p2m->domain, p2m);
+    p2m->get_entry_current = p2m->get_entry;
+    p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
+    return p2m_alloc_table(p2m);
+}
+
+void
+p2m_flush(struct vcpu *v, struct p2m_domain *p2m)
+{
+    struct domain *d = p2m->domain;
+
+    ASSERT(v->domain == d);
+    vcpu_nestedhvm(v).nv_p2m = NULL;
+    nestedp2m_lock(d);
+    BUG_ON(p2m_flush_locked(p2m) != 0);
+    hvm_asid_flush_vcpu(v);
+    nestedhvm_vmcx_flushtlb(p2m);
+    nestedp2m_unlock(d);
+}
+
+void
+p2m_flush_nestedp2m(struct domain *d)
+{
+    int i;
+
+    nestedp2m_lock(d);
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        struct p2m_domain *p2m = d->arch.nested_p2m[i];
+        BUG_ON(p2m_flush_locked(p2m) != 0);
+        cpus_clear(p2m->p2m_dirty_cpumask);
+    }
+    nestedhvm_vmcx_flushtlbdomain(d);
+    nestedp2m_unlock(d);
+}
+
+struct p2m_domain *
+p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3)
+{
+    /* Use volatile to prevent gcc from caching nv->nv_p2m in a cpu register,
+     * as it may be changed within the loop by another (v)cpu.
+     */
+    volatile struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct domain *d;
+    struct p2m_domain *p2m;
+    int i, rv;
+
+    if (cr3 == 0 || cr3 == CR3_EADDR)
+        cr3 = v->arch.hvm_vcpu.guest_cr[3];
+
+    if (nv->nv_flushp2m && nv->nv_p2m) {
+        nv->nv_p2m = NULL;
+    }
+
+    d = v->domain;
+    nestedp2m_lock(d);
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        p2m = d->arch.nested_p2m[i];
+        if ((p2m->cr3 != cr3 && p2m->cr3 != CR3_EADDR) || (p2m != nv->nv_p2m))
+            continue;
+
+        nv->nv_flushp2m = 0;
+        p2m_getlru_nestedp2m(d, p2m);
+        nv->nv_p2m = p2m;
+        if (p2m->cr3 == CR3_EADDR)
+            hvm_asid_flush_vcpu(v);
+        p2m->cr3 = cr3;
+        cpu_set(v->processor, p2m->p2m_dirty_cpumask);
+        nestedp2m_unlock(d);
+        return p2m;
+    }
+
+    /* All p2m's are or were in use. Take the least recently used one,
+     * flush it and reuse it.
+     */
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        p2m = p2m_getlru_nestedp2m(d, NULL);
+        rv = p2m_flush_locked(p2m);
+        if (rv == 0)
+            break;
+    }
+    nv->nv_p2m = p2m;
+    p2m->cr3 = cr3;
+    nv->nv_flushp2m = 0;
+    hvm_asid_flush_vcpu(v);
+    nestedhvm_vmcx_flushtlb(nv->nv_p2m);
+    cpu_set(v->processor, p2m->p2m_dirty_cpumask);
+    nestedp2m_unlock(d);
+
+    return p2m;
+}
+
+struct p2m_domain *
+p2m_get_p2m(struct vcpu *v)
+{
+    if (!nestedhvm_is_n2(v))
+        return p2m_get_hostp2m(v->domain);
+
+    return p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v));
+}
+
+unsigned long paging_gva_to_gfn(struct vcpu *v,
+                                unsigned long va,
+                                uint32_t *pfec)
+{
+    struct p2m_domain *hostp2m = p2m_get_hostp2m(v->domain);
+    const struct paging_mode *hostmode = paging_get_hostmode(v);
+
+    if ( is_hvm_domain(v->domain)
+        && paging_mode_hap(v->domain) 
+        && nestedhvm_is_n2(v) )
+    {
+        unsigned long gfn;
+        struct p2m_domain *p2m;
+        const struct paging_mode *mode;
+        uint64_t ncr3 = nhvm_vcpu_hostcr3(v);
+
+        /* translate l2 guest va into l2 guest gfn */
+        p2m = p2m_get_nestedp2m(v, ncr3);
+        mode = paging_get_nestedmode(v);
+        gfn = mode->gva_to_gfn(v, p2m, va, pfec);
+
+        /* translate l2 guest gfn into l1 guest gfn */
+        return hostmode->p2m_ga_to_gfn(v, hostp2m, ncr3,
+            gfn << PAGE_SHIFT, pfec);
+    }
+
+    return hostmode->gva_to_gfn(v, hostp2m, va, pfec);
+}
+
 /*
  * Local variables:
  * mode: C
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c  Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/paging.c  Tue Apr 05 15:44:09 2011 +0200
@@ -26,6 +26,7 @@
 #include <asm/p2m.h>
 #include <asm/hap.h>
 #include <asm/guest_access.h>
+#include <asm/hvm/nestedhvm.h>
 #include <xen/numa.h>
 #include <xsm/xsm.h>
 
@@ -851,21 +852,58 @@
         printk("    paging assistance: ");
         if ( paging_mode_shadow(v->domain) )
         {
-            if ( v->arch.paging.mode )
+            if ( paging_get_hostmode(v) )
                 printk("shadowed %u-on-%u\n",
-                       v->arch.paging.mode->guest_levels,
-                       v->arch.paging.mode->shadow.shadow_levels);
+                       paging_get_hostmode(v)->guest_levels,
+                       paging_get_hostmode(v)->shadow.shadow_levels);
             else
                 printk("not shadowed\n");
         }
-        else if ( paging_mode_hap(v->domain) && v->arch.paging.mode )
+        else if ( paging_mode_hap(v->domain) && paging_get_hostmode(v) )
             printk("hap, %u levels\n",
-                   v->arch.paging.mode->guest_levels);
+                   paging_get_hostmode(v)->guest_levels);
         else
             printk("none\n");
     }
 }
 
+const struct paging_mode *paging_get_mode(struct vcpu *v)
+{
+    if (!nestedhvm_is_n2(v))
+        return paging_get_hostmode(v);
+
+    return paging_get_nestedmode(v);
+}
+
+extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+
+void paging_update_nestedmode(struct vcpu *v)
+{
+    ASSERT(nestedhvm_enabled(v->domain));
+    if (nestedhvm_paging_mode_hap(v))
+        /* nested-on-nested */
+        v->arch.paging.nestedmode = hap_paging_get_mode(v);
+    else
+        /* TODO: shadow-on-shadow */
+        v->arch.paging.nestedmode = NULL;
+}
+
+void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+                            l1_pgentry_t *p, mfn_t table_mfn,
+                            l1_pgentry_t new, unsigned int level)
+{
+    struct domain *d = p2m->domain;
+    struct vcpu *v = current;
+    if ( v->domain != d )
+        v = d->vcpu ? d->vcpu[0] : NULL;
+    if ( likely(v && paging_mode_enabled(d) && paging_get_hostmode(v) != NULL) )
+    {
+        return paging_get_hostmode(v)->write_p2m_entry(v, gfn, p, table_mfn,
+                                                       new, level);
+    }
+    else
+        safe_write_pte(p, new);
+}
 
 /*
  * Local variables:
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/shadow/multi.c    Tue Apr 05 15:44:09 2011 +0200
@@ -837,22 +837,6 @@
     if ( map != NULL ) sh_unmap_domain_page(map);
 }
 
-static inline int
-perms_strictly_increased(u32 old_flags, u32 new_flags) 
-/* Given the flags of two entries, are the new flags a strict
- * increase in rights over the old ones? */
-{
-    u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
-    u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
-    /* Flip the NX bit, since it's the only one that decreases rights;
-     * we calculate as if it were an "X" bit. */
-    of ^= _PAGE_NX_BIT;
-    nf ^= _PAGE_NX_BIT;
-    /* If the changed bits are all set in the new flags, then rights strictly 
-     * increased between old and new. */
-    return ((of | (of ^ nf)) == nf);
-}
-
 /* type is only used to distinguish grant map pages from ordinary RAM
  * i.e. non-p2m_is_grant() pages are treated as p2m_ram_rw.  */
 static int inline
@@ -3768,7 +3752,8 @@
 
 
 static unsigned long
-sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
+sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m,
+    unsigned long va, uint32_t *pfec)
 /* Called to translate a guest virtual address to what the *guest*
  * pagetables would map it to. */
 {
@@ -4820,7 +4805,7 @@
     struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
 
     /* Translate the VA to a GFN */
-    gfn = sh_gva_to_gfn(v, vaddr, &pfec);
+    gfn = sh_gva_to_gfn(v, p2m, vaddr, &pfec);
     if ( gfn == INVALID_GFN ) 
     {
         if ( is_hvm_vcpu(v) )
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/include/asm-x86/domain.h      Tue Apr 05 15:44:09 2011 +0200
@@ -210,6 +210,8 @@
 struct paging_vcpu {
     /* Pointers to mode-specific entry points. */
     const struct paging_mode *mode;
+    /* Nested Virtualization: paging mode of nested guest */
+    const struct paging_mode *nestedmode;
     /* HVM guest: last emulate was to a pagetable */
     unsigned int last_write_was_pt:1;
     /* HVM guest: last write emulation succeeds */
@@ -225,6 +227,7 @@
 #define MAX_CPUID_INPUT 40
 typedef xen_domctl_cpuid_t cpuid_input_t;
 
+#define MAX_NESTEDP2M 10
 struct p2m_domain;
 struct time_scale {
     int shift;
@@ -273,6 +276,12 @@
     struct paging_domain paging;
     struct p2m_domain *p2m;
 
+    /* nestedhvm: translate l2 guest physical to host physical */
+    struct p2m_domain *nested_p2m[MAX_NESTEDP2M];
+    spinlock_t nested_p2m_lock;
+    int nested_p2m_locker;
+    const char *nested_p2m_function;
+
     /* NB. protected by d->event_lock and by irq_desc[irq].lock */
     int *irq_pirq;
     int *pirq_irq;
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h     Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/include/asm-x86/hvm/hvm.h     Tue Apr 05 15:44:09 2011 +0200
@@ -374,12 +374,12 @@
 
 int hvm_debug_op(struct vcpu *v, int32_t op);
 
-bool_t hvm_hap_nested_page_fault(unsigned long gpa,
-                                 bool_t gla_valid, unsigned long gla,
-                                 bool_t access_valid, 
-                                 bool_t access_r,
-                                 bool_t access_w,
-                                 bool_t access_x);
+int hvm_hap_nested_page_fault(unsigned long gpa,
+                              bool_t gla_valid, unsigned long gla,
+                              bool_t access_valid, 
+                              bool_t access_r,
+                              bool_t access_w,
+                              bool_t access_x);
 
 #define hvm_msr_tsc_aux(v) ({                                               \
     struct domain *__d = (v)->domain;                                       \
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/include/asm-x86/hvm/nestedhvm.h
--- a/xen/include/asm-x86/hvm/nestedhvm.h       Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/include/asm-x86/hvm/nestedhvm.h       Tue Apr 05 15:44:09 2011 +0200
@@ -60,4 +60,9 @@
 #define nestedhvm_vmswitch_in_progress(v)   \
     (!!vcpu_nestedhvm((v)).nv_vmswitch_in_progress)
 
+void nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m);
+void nestedhvm_vmcx_flushtlbdomain(struct domain *d);
+
+bool_t nestedhvm_is_n2(struct vcpu *v);
+
 #endif /* _HVM_NESTEDHVM_H */
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/include/asm-x86/p2m.h Tue Apr 05 15:44:09 2011 +0200
@@ -199,7 +199,15 @@
     /* Shadow translated domain: p2m mapping */
     pagetable_t        phys_table;
 
+    /* Same as domain_dirty_cpumask but limited to
+     * this p2m and those physical cpus whose vcpus are in
+     * guestmode.
+     */
+    cpumask_t          p2m_dirty_cpumask;
+
     struct domain     *domain;   /* back pointer to domain */
+#define CR3_EADDR     (~0ULL)
+    uint64_t           cr3;      /* to identify this p2m for re-use */
 
     /* Pages used to construct the p2m */
     struct page_list_head pages;
@@ -223,6 +231,11 @@
                                                    p2m_type_t ot,
                                                    p2m_type_t nt);
     
+    void               (*write_p2m_entry)(struct p2m_domain *p2m,
+                                          unsigned long gfn, l1_pgentry_t *p,
+                                          mfn_t table_mfn, l1_pgentry_t new,
+                                          unsigned int level);
+
     /* Default P2M access type for each page in the domain: new pages,
      * swapped in pages, cleared pages, and pages that are ambiguously
      * retyped get this access type.  See definition of p2m_access_t. */
@@ -264,8 +277,26 @@
 /* get host p2m table */
 #define p2m_get_hostp2m(d)      ((d)->arch.p2m)
 
+/* Get p2m table (re)usable for specified cr3.
+ * Automatically destroys and re-initializes a p2m if none found.
+ * If cr3 == 0 then v->arch.hvm_vcpu.guest_cr[3] is used.
+ */
+struct p2m_domain *p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3);
+
+/* If vcpu is in host mode then behaviour matches p2m_get_hostp2m().
+ * If vcpu is in guest mode then behaviour matches p2m_get_nestedp2m().
+ */
+struct p2m_domain *p2m_get_p2m(struct vcpu *v);
+
+#define p2m_is_nestedp2m(p2m)   ((p2m) != p2m_get_hostp2m((p2m->domain)))
+
 #define p2m_get_pagetable(p2m)  ((p2m)->phys_table)
 
+/* Flushes specified p2m table */
+void p2m_flush(struct vcpu *v, struct p2m_domain *p2m);
+/* Flushes all nested p2m tables */
+void p2m_flush_nestedp2m(struct domain *d);
+
 /*
  * The P2M lock.  This protects all updates to the p2m table.
  * Updates are expected to be safe against concurrent reads,
@@ -307,6 +338,38 @@
     (current->processor == (_p2m)->locker)
 
 
+#define nestedp2m_lock_init(_domain)                                  \
+    do {                                                              \
+        spin_lock_init(&(_domain)->arch.nested_p2m_lock);             \
+        (_domain)->arch.nested_p2m_locker = -1;                       \
+        (_domain)->arch.nested_p2m_function = "nobody";               \
+    } while (0)
+
+#define nestedp2m_locked_by_me(_domain)                \
+    (current->processor == (_domain)->arch.nested_p2m_locker)
+
+#define nestedp2m_lock(_domain)                                       \
+    do {                                                              \
+        if ( nestedp2m_locked_by_me(_domain) )                        \
+        {                                                             \
+            printk("Error: p2m lock held by %s\n",                    \
+                   (_domain)->arch.nested_p2m_function);              \
+            BUG();                                                    \
+        }                                                             \
+        spin_lock(&(_domain)->arch.nested_p2m_lock);                  \
+        ASSERT((_domain)->arch.nested_p2m_locker == -1);              \
+        (_domain)->arch.nested_p2m_locker = current->processor;       \
+        (_domain)->arch.nested_p2m_function = __func__;               \
+    } while (0)
+
+#define nestedp2m_unlock(_domain)                                      \
+    do {                                                               \
+        ASSERT(nestedp2m_locked_by_me(_domain));                       \
+        (_domain)->arch.nested_p2m_locker = -1;                        \
+        (_domain)->arch.nested_p2m_function = "nobody";                \
+        spin_unlock(&(_domain)->arch.nested_p2m_lock);                 \
+    } while (0)
+
 /* Extract the type from the PTE flags that store it */
 static inline p2m_type_t p2m_flags_to_type(unsigned long flags)
 {
@@ -424,11 +487,21 @@
 /* Init the datastructures for later use by the p2m code */
 int p2m_init(struct domain *d);
 
+/* PTE flags for various types of p2m entry */
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn);
+
 /* Allocate a new p2m table for a domain. 
  *
  * Returns 0 for success or -errno. */
 int p2m_alloc_table(struct p2m_domain *p2m);
 
+/* Find the next level's P2M entry, checking for out-of-range gfns...
+ * Returns NULL on error.
+ */
+l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+               unsigned long gfn, uint32_t shift, uint32_t max);
+
 /* Return all the p2m resources to Xen. */
 void p2m_teardown(struct p2m_domain *p2m);
 void p2m_final_teardown(struct domain *d);
@@ -502,6 +575,8 @@
 int set_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn);
 int clear_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn);
 
+void nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+    l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level);
 
 #ifdef __x86_64__
 /* Modify p2m table for shared gfn */
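As a rough illustration (not part of this changeset), the lookup and flush
helpers declared above might be combined as follows; the function name and its
flush_all parameter are hypothetical, and real callers in the nested hap code
may differ:

    /* Minimal usage sketch, assuming only the declarations added above. */
    static void nested_p2m_usage_sketch(struct vcpu *v, bool_t flush_all)
    {
        /* Host mode: this is the host p2m.  Guest mode with nested hap:
         * the p2m that translates l2 guest physical addresses. */
        struct p2m_domain *p2m = p2m_get_p2m(v);

        if ( !p2m_is_nestedp2m(p2m) )
            return;  /* vcpu is in host mode; nothing nested to do */

        if ( flush_all )
            /* Drop every nested p2m the domain owns. */
            p2m_flush_nestedp2m(v->domain);
        else
            /* Drop only the p2m matching the l1 guest's current nested cr3
             * (cr3 == 0 falls back to v->arch.hvm_vcpu.guest_cr[3]). */
            p2m_flush(v, p2m_get_nestedp2m(v, 0));
    }
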
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/include/asm-x86/page.h
--- a/xen/include/asm-x86/page.h        Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/include/asm-x86/page.h        Tue Apr 05 15:44:09 2011 +0200
@@ -391,6 +391,23 @@
     return ((cacheattr & 4) << 5) | ((cacheattr & 3) << 3);
 }
 
+/* return true if permissions strictly increased */
+static inline bool_t
+perms_strictly_increased(uint32_t old_flags, uint32_t new_flags)
+/* Given the flags of two entries, are the new flags a strict
+ * increase in rights over the old ones? */
+{
+    uint32_t of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
+    uint32_t nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
+    /* Flip the NX bit, since it's the only one that decreases rights;
+     * we calculate as if it were an "X" bit. */
+    of ^= _PAGE_NX_BIT;
+    nf ^= _PAGE_NX_BIT;
+    /* If the changed bits are all set in the new flags, then rights strictly
+     * increased between old and new. */
+    return ((of | (of ^ nf)) == nf);
+}
+
 #endif /* !__ASSEMBLY__ */
 
 #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)
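
To make the strict-increase rule concrete, a few worked cases (a sketch, not
part of this changeset, assuming the usual _PAGE_* flag definitions from this
header):

    void perms_strictly_increased_examples(void)
    {
        /* Gaining rights on every bit that changes is a strict increase... */
        ASSERT( perms_strictly_increased(0, _PAGE_PRESENT|_PAGE_RW));
        ASSERT( perms_strictly_increased(_PAGE_PRESENT,
                                         _PAGE_PRESENT|_PAGE_RW));
        /* ...but dropping _PAGE_RW, or setting _PAGE_NX_BIT (i.e. losing
         * execute rights), is not. */
        ASSERT(!perms_strictly_increased(_PAGE_PRESENT|_PAGE_RW,
                                         _PAGE_PRESENT));
        ASSERT(!perms_strictly_increased(_PAGE_PRESENT,
                                         _PAGE_PRESENT|_PAGE_NX_BIT));
    }
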
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h      Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/include/asm-x86/paging.h      Tue Apr 05 15:44:09 2011 +0200
@@ -108,8 +108,14 @@
     int           (*page_fault            )(struct vcpu *v, unsigned long va,
                                             struct cpu_user_regs *regs);
     int           (*invlpg                )(struct vcpu *v, unsigned long va);
-    unsigned long (*gva_to_gfn            )(struct vcpu *v, unsigned long va,
+    unsigned long (*gva_to_gfn            )(struct vcpu *v,
+                                            struct p2m_domain *p2m,
+                                            unsigned long va,
                                             uint32_t *pfec);
+    unsigned long (*p2m_ga_to_gfn         )(struct vcpu *v,
+                                            struct p2m_domain *p2m,
+                                            unsigned long cr3,
+                                            paddr_t ga, uint32_t *pfec);
     void          (*update_cr3            )(struct vcpu *v, int do_locking);
     void          (*update_paging_modes   )(struct vcpu *v);
     void          (*write_p2m_entry       )(struct vcpu *v, unsigned long gfn,
@@ -219,6 +225,10 @@
  * creation. */
 int paging_enable(struct domain *d, u32 mode);
 
+#define paging_get_hostmode(v)         ((v)->arch.paging.mode)
+#define paging_get_nestedmode(v)       ((v)->arch.paging.nestedmode)
+const struct paging_mode *paging_get_mode(struct vcpu *v);
+void paging_update_nestedmode(struct vcpu *v);
 
 /* Page fault handler
  * Called from pagefault handler in Xen, and from the HVM trap handlers
@@ -233,7 +243,7 @@
 paging_fault(unsigned long va, struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
-    return v->arch.paging.mode->page_fault(v, va, regs);
+    return paging_get_hostmode(v)->page_fault(v, va, regs);
 }
 
 /* Handle invlpg requests on vcpus.
@@ -241,7 +251,7 @@
  * or 0 if it's safe not to do so. */
 static inline int paging_invlpg(struct vcpu *v, unsigned long va)
 {
-    return v->arch.paging.mode->invlpg(v, va);
+    return paging_get_hostmode(v)->invlpg(v, va);
 }
 
 /* Translate a guest virtual address to the frame number that the
@@ -251,11 +261,30 @@
  * walking the tables.  The caller should set the PFEC_page_present bit
  * in pfec[0]; in the failure case, that bit will be cleared if appropriate. */
 #define INVALID_GFN (-1UL)
-static inline unsigned long paging_gva_to_gfn(struct vcpu *v, 
-                                              unsigned long va,
-                                              uint32_t *pfec)
+unsigned long paging_gva_to_gfn(struct vcpu *v,
+                                unsigned long va,
+                                uint32_t *pfec);
+
+/* Translates a guest virtual address to a guest physical address,
+ * where the specified cr3 is translated to a host physical address
+ * using the specified p2m table.
+ * This allows page walks to be done in the guest or even in the nested guest.
+ * It returns the guest's gfn or the nested guest's gfn.
+ * Use 'paddr_t' for the guest address so it won't overflow when the
+ * guest or nested guest is in 32-bit PAE mode.
+ */
+static inline unsigned long paging_p2m_ga_to_gfn(struct vcpu *v,
+                                                 struct p2m_domain *p2m,
+                                                 const struct paging_mode *mode,
+                                                 unsigned long cr3,
+                                                 paddr_t ga,
+                                                 uint32_t *pfec)
 {
-    return v->arch.paging.mode->gva_to_gfn(v, va, pfec);
+    if ( is_hvm_domain(v->domain) && paging_mode_hap(v->domain) )
+        return mode->p2m_ga_to_gfn(v, p2m, cr3, ga, pfec);
+
+    /* shadow paging */
+    return paging_gva_to_gfn(v, ga, pfec);
 }
 
 /* Update all the things that are derived from the guest's CR3.
@@ -263,7 +292,7 @@
  * as the value to load into the host CR3 to schedule this vcpu */
 static inline void paging_update_cr3(struct vcpu *v)
 {
-    v->arch.paging.mode->update_cr3(v, 1);
+    paging_get_hostmode(v)->update_cr3(v, 1);
 }
 
 /* Update all the things that are derived from the guest's CR0/CR3/CR4.
@@ -271,7 +300,7 @@
  * has changed, and when bringing up a VCPU for the first time. */
 static inline void paging_update_paging_modes(struct vcpu *v)
 {
-    v->arch.paging.mode->update_paging_modes(v);
+    paging_get_hostmode(v)->update_paging_modes(v);
 }
 
 
@@ -283,7 +312,7 @@
 {
     if ( unlikely(paging_mode_enabled(v->domain) 
                   && v->arch.paging.mode != NULL) )
-        return v->arch.paging.mode->write_guest_entry(v, p, new, gmfn);
+        return paging_get_hostmode(v)->write_guest_entry(v, p, new, gmfn);
     else 
         return (!__copy_to_user(p, &new, sizeof(new)));
 }
@@ -299,7 +328,7 @@
 {
     if ( unlikely(paging_mode_enabled(v->domain) 
                   && v->arch.paging.mode != NULL) )
-        return v->arch.paging.mode->cmpxchg_guest_entry(v, p, old, new, gmfn);
+        return paging_get_hostmode(v)->cmpxchg_guest_entry(v, p, old, new, gmfn);
     else 
         return (!cmpxchg_user(p, *old, new));
 }
@@ -327,21 +356,11 @@
  * a pointer to the entry to be written, the MFN in which the entry resides, 
  * the new contents of the entry, and the level in the p2m tree at which 
  * we are writing. */
-static inline void paging_write_p2m_entry(struct domain *d, unsigned long gfn, 
-                                          l1_pgentry_t *p, mfn_t table_mfn,
-                                          l1_pgentry_t new, unsigned int level)
-{
-    struct vcpu *v = current;
-    if ( v->domain != d )
-        v = d->vcpu ? d->vcpu[0] : NULL;
-    if ( likely(v && paging_mode_enabled(d) && v->arch.paging.mode != NULL) )
-    {
-        return v->arch.paging.mode->write_p2m_entry(v, gfn, p, table_mfn,
-                                                    new, level);
-    }
-    else 
-        safe_write_pte(p, new);
-}
+struct p2m_domain;
+
+void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, 
+                            l1_pgentry_t *p, mfn_t table_mfn,
+                            l1_pgentry_t new, unsigned int level);
 
 /* Called from the guest to indicate that a process is being
  * torn down and its pagetables will soon be discarded */
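The inline dispatcher removed above moves out of line; one plausible shape for
it (a sketch only, assuming the per-p2m write_p2m_entry hook added to struct
p2m_domain earlier in this patch; the real body lives in paging.c and may
differ):

    void paging_write_p2m_entry_sketch(struct p2m_domain *p2m, unsigned long gfn,
                                       l1_pgentry_t *p, mfn_t table_mfn,
                                       l1_pgentry_t new, unsigned int level)
    {
        struct vcpu *v = current;

        if ( v->domain != p2m->domain )
            v = p2m->domain->vcpu ? p2m->domain->vcpu[0] : NULL;
        if ( likely(v && paging_mode_enabled(p2m->domain)
                    && v->arch.paging.mode != NULL) )
            /* Per-p2m hook; nestedp2m_write_p2m_entry() for nested p2ms. */
            p2m->write_p2m_entry(p2m, gfn, p, table_mfn, new, level);
        else
            safe_write_pte(p, new);
    }
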
@@ -362,7 +381,7 @@
     l2_pgentry_t l2e;
 
     if ( unlikely(paging_mode_translate(v->domain)) )
-        return v->arch.paging.mode->guest_map_l1e(v, addr, gl1mfn);
+        return paging_get_hostmode(v)->guest_map_l1e(v, addr, gl1mfn);
 
     /* Find this l1e and its enclosing l1mfn in the linear map */
     if ( __copy_from_user(&l2e, 
@@ -398,7 +417,7 @@
         return;
     }
         
-    v->arch.paging.mode->guest_get_eff_l1e(v, addr, eff_l1e);
+    paging_get_hostmode(v)->guest_get_eff_l1e(v, addr, eff_l1e);
 }
 
 /* Read the guest's l1e that maps this address, from the kernel-mode

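Tying the pieces together, a walk of the l1 guest's nested page tables with the
new p2m_ga_to_gfn hook might look roughly like this (a sketch only; nested_cr3
stands in for whatever value the nested-hvm code reports as the l1 guest's
nested cr3, and the real call sites may differ):

    static unsigned long l2_gpa_to_l1_gfn_sketch(struct vcpu *v, paddr_t l2_gpa,
                                                 uint64_t nested_cr3)
    {
        /* The nested page tables live in l1 guest physical address space,
         * so their pages are found through the host p2m. */
        struct p2m_domain *hostp2m = p2m_get_hostp2m(v->domain);
        const struct paging_mode *mode = paging_get_hostmode(v);
        uint32_t pfec = PFEC_page_present;

        return paging_p2m_ga_to_gfn(v, hostp2m, mode, nested_cr3, l2_gpa, &pfec);
    }
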
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
