# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1270534496 -3600
# Node ID d7370232060a31d17cd27c9d40a4a6cf2f09935d
# Parent b20f897d6010457ec507138d450a332eba5147ea
EPT: 1GB large page support.
Allocate 1GB large pages for EPT when possible. This patch also contains the
logic to split a large page into smaller ones (2MB or 4KB).
Signed-off-by: Dongxiao Xu <dongxiao.xu@xxxxxxxxx>
Signed-off-by: Xiaohui Xin <xiaohui.xin@xxxxxxxxx>
Acked-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---
xen/arch/x86/hvm/hvm.c | 5
xen/arch/x86/hvm/vmx/vmcs.c | 16 ++
xen/arch/x86/hvm/vmx/vmx.c | 3
xen/arch/x86/mm/hap/p2m-ept.c | 199 +++++++++++++++++++------------------
xen/include/asm-x86/hvm/vmx/vmcs.h | 7 +
xen/include/asm-x86/msr-index.h | 1
6 files changed, 134 insertions(+), 97 deletions(-)
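
For orientation: with 9-bit EPT tables and 4KB base pages, a leaf entry at
walk level 0 maps 4KB, at level 1 maps 2MB, and at level 2 maps 1GB. A
minimal standalone sketch of that relationship (not part of the patch; the
constants match the ones used in the Xen tree):

    #include <stdio.h>

    #define EPT_TABLE_ORDER 9   /* 512 entries per EPT table */
    #define PAGE_SHIFT      12  /* 4KB base pages */

    /* Bytes mapped by a leaf entry at a given walk level. */
    static unsigned long ept_mapping_bytes(int level)
    {
        return 1UL << (PAGE_SHIFT + level * EPT_TABLE_ORDER);
    }

    int main(void)
    {
        int level;
        for ( level = 0; level <= 2; level++ )
            printf("level %d maps %lu KB\n",
                   level, ept_mapping_bytes(level) >> 10);
        return 0;
    }

The new vmx_ept_super_page_level_limit below is expressed in these terms:
2 permits leaf entries up to level 2 (1GB), 1 up to level 1 (2MB), and 0
means 4KB mappings only.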
diff -r b20f897d6010 -r d7370232060a xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c Tue Apr 06 07:13:19 2010 +0100
+++ b/xen/arch/x86/hvm/hvm.c Tue Apr 06 07:14:56 2010 +0100
@@ -966,6 +966,11 @@ bool_t hvm_hap_nested_page_fault(unsigne
/* Spurious fault? PoD and log-dirty also take this path. */
if ( p2m_is_ram(p2mt) )
{
+ /*
+ * Page log-dirty is always done with order 0. If this mfn resides in
+ * a large page, we do not change the type of other pages within that
+ * large page.
+ */
paging_mark_dirty(current->domain, mfn_x(mfn));
p2m_change_type(current->domain, gfn, p2m_ram_logdirty, p2m_ram_rw);
return 1;
diff -r b20f897d6010 -r d7370232060a xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c Tue Apr 06 07:13:19 2010 +0100
+++ b/xen/arch/x86/hvm/vmx/vmcs.c Tue Apr 06 07:14:56 2010 +0100
@@ -64,6 +64,7 @@ u32 vmx_secondary_exec_control __read_mo
u32 vmx_secondary_exec_control __read_mostly;
u32 vmx_vmexit_control __read_mostly;
u32 vmx_vmentry_control __read_mostly;
+u8 vmx_ept_super_page_level_limit __read_mostly;
bool_t cpu_has_vmx_ins_outs_instr_info __read_mostly;
static DEFINE_PER_CPU_READ_MOSTLY(struct vmcs_struct *, host_vmcs);
@@ -183,6 +184,21 @@ static void vmx_init_vmcs_config(void)
_vmx_secondary_exec_control &=
~(SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST);
+ if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
+ {
+ uint64_t cap;
+ rdmsrl(MSR_IA32_VMX_EPT_VPID_CAP, cap);
+ if ( cap & VMX_EPT_SUPER_PAGE_1G )
+ {
+ vmx_ept_super_page_level_limit = 2;
+ printk("EPT support 1G super page.\n");
+ }
+ else if ( cap & VMX_EPT_SUPER_PAGE_2M )
+ {
+ vmx_ept_super_page_level_limit = 1;
+ printk("EPT support 2M super page.\n");
+ }
+ }
}
if ( (_vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) &&
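
The probe above records the deepest level at which a leaf (superpage) entry
may be used. A free-standing sketch of the same decision, reusing the bit
definitions this patch adds to vmcs.h (the helper name is hypothetical):

    #include <stdint.h>

    #define VMX_EPT_SUPER_PAGE_2M 0x00010000ULL  /* cap bit 16: 2MB pages */
    #define VMX_EPT_SUPER_PAGE_1G 0x00020000ULL  /* cap bit 17: 1GB pages */

    /* Derive the superpage level limit from the contents of
     * IA32_VMX_EPT_VPID_CAP (MSR 0x48c). */
    static uint8_t ept_level_limit_from_cap(uint64_t cap)
    {
        if ( cap & VMX_EPT_SUPER_PAGE_1G )
            return 2;   /* leaf entries allowed at level 2 (1GB) */
        if ( cap & VMX_EPT_SUPER_PAGE_2M )
            return 1;   /* leaf entries allowed at level 1 (2MB) */
        return 0;       /* 4KB mappings only */
    }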
diff -r b20f897d6010 -r d7370232060a xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c Tue Apr 06 07:13:19 2010 +0100
+++ b/xen/arch/x86/hvm/vmx/vmx.c Tue Apr 06 07:14:56 2010 +0100
@@ -1446,7 +1446,8 @@ void start_vmx(void)
if ( cpu_has_vmx_ept )
vmx_function_table.hap_supported = 1;
- vmx_function_table.hap_1gb_pgtb = 0;
+ vmx_function_table.hap_1gb_pgtb = ( vmx_ept_super_page_level_limit == 2 ) ?
+ 1 : 0;
setup_vmcs_dump();
diff -r b20f897d6010 -r d7370232060a xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c Tue Apr 06 07:13:19 2010 +0100
+++ b/xen/arch/x86/mm/hap/p2m-ept.c Tue Apr 06 07:14:56 2010 +0100
@@ -25,6 +25,7 @@
#include <asm/domain.h>
#include <asm/p2m.h>
#include <asm/hvm/vmx/vmx.h>
+#include <asm/hvm/vmx/vmcs.h>
#include <xen/iommu.h>
#include <asm/mtrr.h>
#include <asm/hvm/cacheattr.h>
@@ -167,6 +168,61 @@ static int ept_next_level(struct domain
}
}
+/* The entry was a superpage before; we should break it down now. */
+static int ept_split_large_page(struct domain *d,
+ ept_entry_t **table, u32 *index,
+ unsigned long gfn, int level)
+{
+ ept_entry_t *prev_table = *table;
+ ept_entry_t *split_table = NULL;
+ ept_entry_t *split_entry = NULL;
+ ept_entry_t *ept_entry = (*table) + (*index);
+ ept_entry_t temp_ept_entry;
+ unsigned long s_gfn, s_mfn;
+ unsigned long offset, trunk;
+ int i;
+
+ /* Allocate a new page for the EPT middle-level entry that will
+ * replace the leaf superpage entry.
+ */
+
+ if ( !ept_set_middle_entry(d, &temp_ept_entry) )
+ return 0;
+
+ /* Split the superpage into smaller next-level pages. */
+ split_table = map_domain_page(temp_ept_entry.mfn);
+ offset = gfn & ((1UL << (level * EPT_TABLE_ORDER)) - 1);
+ trunk = (1UL << ((level-1) * EPT_TABLE_ORDER));
+
+ for ( i = 0; i < (1UL << EPT_TABLE_ORDER); i++ )
+ {
+ s_gfn = gfn - offset + i * trunk;
+ s_mfn = ept_entry->mfn + i * trunk;
+
+ split_entry = split_table + i;
+ split_entry->emt = ept_entry->emt;
+ split_entry->ipat = ept_entry->ipat;
+
+ split_entry->sp_avail = (level > 1) ? 1 : 0;
+
+ split_entry->mfn = s_mfn;
+
+ split_entry->avail1 = ept_entry->avail1;
+ split_entry->avail2 = 0;
+ /* last step */
+ split_entry->r = split_entry->w = split_entry->x = 1;
+ ept_p2m_type_to_flags(split_entry, ept_entry->avail1);
+ }
+
+ *ept_entry = temp_ept_entry;
+
+ *index = offset / trunk;
+ *table = split_table;
+ unmap_domain_page(prev_table);
+
+ return 1;
+}
+
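
The offset/trunk arithmetic above can be made concrete with a worked example
(the gfn value is hypothetical, chosen only for illustration): splitting a
level-2 (1GB) entry because one gfn inside it must change:

    #include <stdio.h>

    #define EPT_TABLE_ORDER 9

    int main(void)
    {
        int level = 2;                /* splitting a 1GB leaf entry */
        unsigned long gfn = 0x412ab;  /* gfn being updated (hypothetical) */

        /* Position of gfn within the superpage (18 low bits at level 2). */
        unsigned long offset = gfn & ((1UL << (level * EPT_TABLE_ORDER)) - 1);

        /* Frames covered by each of the 512 new next-level (2MB) entries. */
        unsigned long trunk = 1UL << ((level - 1) * EPT_TABLE_ORDER);

        /* Index of the new 2MB entry that now contains gfn. */
        unsigned long index = offset / trunk;

        printf("offset=%#lx trunk=%lu index=%lu\n", offset, trunk, index);
        /* Prints: offset=0x12ab trunk=512 index=9 */
        return 0;
    }

The caller then descends into that entry and, if a 4KB mapping is needed,
repeats the split at level 1.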
/*
* ept_set_entry() computes 'need_modify_vtd_table' for itself,
* by observing whether any gfn->mfn translations are modified.
@@ -183,13 +239,11 @@ ept_set_entry(struct domain *d, unsigned
int i;
int rv = 0;
int ret = 0;
+ int split_level = 0;
int walk_level = order / EPT_TABLE_ORDER;
int direct_mmio = (p2mt == p2m_mmio_direct);
uint8_t ipat = 0;
int need_modify_vtd_table = 1;
-
- /* We only support 4k and 2m pages now */
- BUG_ON(order && order != EPT_TABLE_ORDER);
if ( order != 0 )
if ( (gfn & ((1UL << order) - 1)) )
@@ -208,15 +262,15 @@ ept_set_entry(struct domain *d, unsigned
break;
}
- /* If order == 9, we should never get SUPERPAGE or PoD.
- * If order == 0, we should only get POD if we have a POD superpage.
+ /* If order == 0, we should only get POD if we have a POD superpage.
* If i > walk_level, we need to split the page; otherwise,
* just behave as normal. */
- ASSERT(order == 0 || ret == GUEST_TABLE_NORMAL_PAGE);
ASSERT(ret != GUEST_TABLE_POD_PAGE || i != walk_level);
index = gfn_remainder >> ( i ? (i * EPT_TABLE_ORDER): order);
offset = (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1)));
+
+ split_level = i;
ept_entry = table + index;
@@ -231,25 +285,10 @@ ept_set_entry(struct domain *d, unsigned
ept_entry->ipat = ipat;
ept_entry->sp_avail = order ? 1 : 0;
- if ( ret == GUEST_TABLE_SUPER_PAGE )
- {
- if ( ept_entry->mfn == (mfn_x(mfn) - offset) )
- need_modify_vtd_table = 0;
- else
- ept_entry->mfn = mfn_x(mfn) - offset;
-
- if ( (ept_entry->avail1 == p2m_ram_logdirty)
- && (p2mt == p2m_ram_rw) )
- for ( i = 0; i < 512; i++ )
- paging_mark_dirty(d, mfn_x(mfn) - offset + i);
- }
+ if ( ept_entry->mfn == mfn_x(mfn) )
+ need_modify_vtd_table = 0;
else
- {
- if ( ept_entry->mfn == mfn_x(mfn) )
- need_modify_vtd_table = 0;
- else
- ept_entry->mfn = mfn_x(mfn);
- }
+ ept_entry->mfn = mfn_x(mfn);
ept_entry->avail1 = p2mt;
ept_entry->avail2 = 0;
@@ -261,51 +300,22 @@ ept_set_entry(struct domain *d, unsigned
}
else
{
- /*
- * It's super page before, now set one of the 4k pages, so
- * we should split the 2m page to 4k pages now.
- */
- /* Pointers to / into new (split) middle-level table */
- ept_entry_t *split_table = NULL;
- ept_entry_t *split_ept_entry = NULL;
- /* Info about old (superpage) table */
- unsigned long super_mfn = ept_entry->mfn;
- p2m_type_t super_p2mt = ept_entry->avail1;
- /* The new l2 entry which we'll write after we've build the new l1 table */
- ept_entry_t l2_ept_entry;
-
- /*
- * Allocate new page for new ept middle level entry which is
- * before a leaf super entry
- */
- if ( !ept_set_middle_entry(d, &l2_ept_entry) )
- goto out;
-
- /* Split the super page before to 4k pages */
- split_table = map_domain_page(l2_ept_entry.mfn);
- offset = gfn & ((1 << EPT_TABLE_ORDER) - 1);
-
- for ( i = 0; i < 512; i++ )
- {
- split_ept_entry = split_table + i;
- split_ept_entry->emt = epte_get_entry_emt(d, gfn - offset + i,
- _mfn(super_mfn + i),
- &ipat, direct_mmio);
- split_ept_entry->ipat = ipat;
- split_ept_entry->sp_avail = 0;
- /* Don't increment mfn if it's a PoD mfn */
- if ( super_p2mt != p2m_populate_on_demand )
- split_ept_entry->mfn = super_mfn + i;
- else
- split_ept_entry->mfn = super_mfn;
- split_ept_entry->avail1 = super_p2mt;
- split_ept_entry->avail2 = 0;
-
- ept_p2m_type_to_flags(split_ept_entry, super_p2mt);
- }
-
- /* Set the destinated 4k page as normal */
- split_ept_entry = split_table + offset;
+ int num = order / EPT_TABLE_ORDER;
+ int level;
+ ept_entry_t *split_ept_entry;
+
+ if ( num >= cpu_vmx_ept_super_page_level_limit )
+ num = cpu_vmx_ept_super_page_level_limit;
+ for ( level = split_level; level > num ; level-- )
+ {
+ rv = ept_split_large_page(d, &table, &index, gfn, level);
+ if ( !rv )
+ goto out;
+ }
+
+ split_ept_entry = table + index;
+ split_ept_entry->avail1 = p2mt;
+ ept_p2m_type_to_flags(split_ept_entry, p2mt);
split_ept_entry->emt = epte_get_entry_emt(d, gfn, mfn, &ipat,
direct_mmio);
split_ept_entry->ipat = ipat;
@@ -314,12 +324,6 @@ ept_set_entry(struct domain *d, unsigned
need_modify_vtd_table = 0;
else
split_ept_entry->mfn = mfn_x(mfn);
-
- split_ept_entry->avail1 = p2mt;
- ept_p2m_type_to_flags(split_ept_entry, p2mt);
-
- unmap_domain_page(split_table);
- *ept_entry = l2_ept_entry;
}
/* Track the highest gfn for which we have ever had a valid mapping */
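
Putting the pieces together, the number of split iterations follows directly
from the requested order and the level at which the table walk stopped:

    /* Worked scenario for the new split path (numbers hypothetical):
     * an order-0 (4KB) update lands inside a 1GB leaf entry, so
     *   num = order / EPT_TABLE_ORDER = 0   (target leaf level)
     *   split_level = 2                     (walk stopped at a 1GB leaf)
     * and the loop "for ( level = split_level; level > num; level-- )"
     * calls ept_split_large_page() twice:
     *   level 2: 1GB leaf -> 512 x 2MB leaves
     *   level 1: 2MB leaf -> 512 x 4KB entries */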
@@ -336,7 +340,7 @@ out:
ept_sync_domain(d);
/* Now the p2m table is not shared with vt-d page table */
- if ( iommu_enabled && need_iommu(d) && need_modify_vtd_table )
+ if ( rv && iommu_enabled && need_iommu(d) && need_modify_vtd_table )
{
if ( p2mt == p2m_ram_rw )
{
@@ -459,7 +463,7 @@ out:
/* WARNING: Only caller doesn't care about PoD pages. So this function will
* always return 0 for PoD pages, not populate them. If that becomes
* necessary, pass a p2m_query_t type along to distinguish. */
-static ept_entry_t ept_get_entry_content(struct domain *d, unsigned long gfn)
+static ept_entry_t ept_get_entry_content(struct domain *d, unsigned long gfn,
+ int *level)
{
ept_entry_t *table =
map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
@@ -487,6 +491,7 @@ static ept_entry_t ept_get_entry_content
index = gfn_remainder >> (i * EPT_TABLE_ORDER);
ept_entry = table + index;
content = *ept_entry;
+ *level = i;
out:
unmap_domain_page(table);
@@ -579,7 +584,10 @@ void ept_change_entry_emt_with_range(str
p2m_lock(d->arch.p2m);
for ( gfn = start_gfn; gfn <= end_gfn; gfn++ )
{
- e = ept_get_entry_content(d, gfn);
+ int level = 0;
+ uint64_t trunk = 0;
+
+ e = ept_get_entry_content(d, gfn, &level);
if ( !p2m_has_emt(e.avail1) )
continue;
@@ -588,25 +596,24 @@ void ept_change_entry_emt_with_range(str
if ( e.sp_avail )
{
- if ( !(gfn & ((1 << EPT_TABLE_ORDER) - 1)) &&
- ((gfn + 0x1FF) <= end_gfn) )
+ while ( level )
{
- /*
- * gfn assigned with 2M, and the end covers more than 2m areas.
- * Set emt for super page.
- */
- order = EPT_TABLE_ORDER;
- if ( need_modify_ept_entry(d, gfn, mfn, e.ipat, e.emt, e.avail1) )
- ept_set_entry(d, gfn, mfn, order, e.avail1);
- gfn += 0x1FF;
- }
- else
- {
- /* Change emt for partial entries of the 2m area. */
- if ( need_modify_ept_entry(d, gfn, mfn, e.ipat, e.emt, e.avail1) )
- ept_set_entry(d, gfn, mfn, order, e.avail1);
- gfn = ((gfn >> EPT_TABLE_ORDER) << EPT_TABLE_ORDER) + 0x1FF;
- }
+ trunk = (1UL << (level * EPT_TABLE_ORDER)) - 1;
+ if ( !(gfn & trunk) && (gfn + trunk <= end_gfn) )
+ {
+ /* gfn is aligned to a 2M or 1G superpage and the range to
+ * change covers the whole superpage.
+ * Set emt for the super page.
+ */
+ order = level * EPT_TABLE_ORDER;
+ if ( need_modify_ept_entry(d, gfn, mfn,
+ e.ipat, e.emt, e.avail1) )
+ ept_set_entry(d, gfn, mfn, order, e.avail1);
+ gfn += trunk;
+ break;
+ }
+ level--;
+ }
}
else /* gfn assigned with 4k */
{
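
The while loop above generalises the old fixed-2M handling: starting from the
level at which the entry was found, it takes the largest superpage order that
is gfn-aligned and fully contained in [gfn, end_gfn], falling back one level
at a time. A free-standing sketch of that selection (the helper name is
hypothetical):

    #define EPT_TABLE_ORDER 9

    /* Largest order (in bits) at which gfn can be updated in one go,
     * searching downward from the level at which the entry was found. */
    static int pick_order(unsigned long gfn, unsigned long end_gfn, int level)
    {
        while ( level )
        {
            unsigned long trunk = (1UL << (level * EPT_TABLE_ORDER)) - 1;
            if ( !(gfn & trunk) && (gfn + trunk <= end_gfn) )
                return level * EPT_TABLE_ORDER;  /* whole superpage fits */
            level--;                             /* try a smaller page */
        }
        return 0;                                /* fall back to 4KB */
    }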
diff -r b20f897d6010 -r d7370232060a xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h Tue Apr 06 07:13:19 2010 +0100
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h Tue Apr 06 07:14:56 2010 +0100
@@ -175,6 +175,11 @@ extern u32 vmx_secondary_exec_control;
extern u32 vmx_secondary_exec_control;
extern bool_t cpu_has_vmx_ins_outs_instr_info;
+
+extern u8 vmx_ept_super_page_level_limit;
+
+#define VMX_EPT_SUPER_PAGE_2M 0x00010000
+#define VMX_EPT_SUPER_PAGE_1G 0x00020000
#define cpu_has_wbinvd_exiting \
(vmx_secondary_exec_control & SECONDARY_EXEC_WBINVD_EXITING)
@@ -203,6 +208,8 @@ extern bool_t cpu_has_vmx_ins_outs_instr
SECONDARY_EXEC_UNRESTRICTED_GUEST)
#define cpu_has_vmx_ple \
(vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+#define cpu_vmx_ept_super_page_level_limit \
+ vmx_ept_super_page_level_limit
/* GUEST_INTERRUPTIBILITY_INFO flags. */
#define VMX_INTR_SHADOW_STI 0x00000001
diff -r b20f897d6010 -r d7370232060a xen/include/asm-x86/msr-index.h
--- a/xen/include/asm-x86/msr-index.h Tue Apr 06 07:13:19 2010 +0100
+++ b/xen/include/asm-x86/msr-index.h Tue Apr 06 07:14:56 2010 +0100
@@ -166,6 +166,7 @@
#define MSR_IA32_VMX_CR4_FIXED0 0x488
#define MSR_IA32_VMX_CR4_FIXED1 0x489
#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
+#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c
#define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x48d
#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x48e
#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x48f