At 10:41 +0100 on 22 Apr (1271932887), Qing He wrote:
> This patch adds virtual ept capability to L1.
> It's implemented as a simple per vCPU vTLB like component
> independent to domain wide p2m.
>
> Signed-off-by: Qing He <qing.he@xxxxxxxxx>
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/nest.c
> --- a/xen/arch/x86/hvm/vmx/nest.c Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/arch/x86/hvm/vmx/nest.c Thu Apr 22 22:30:10 2010 +0800
> @@ -26,6 +26,7 @@
> #include <asm/hvm/vmx/vmx.h>
> #include <asm/hvm/vmx/vvmcs.h>
> #include <asm/hvm/vmx/nest.h>
> +#include <asm/hvm/vmx/vept.h>
>
> /*
> * VMX instructions support functions
> @@ -295,6 +296,9 @@
> __vmptrld(virt_to_maddr(nest->hvmcs));
> v->arch.hvm_vmx.launched = 0;
>
> + nest->geptp = 0;
> + nest->vept = vept_init(v);
> +
> vmreturn(regs, VMSUCCEED);
>
> out:
> @@ -313,6 +317,9 @@
> if ( unlikely(!nest->guest_vmxon_pa) )
> goto invalid_op;
>
> + vept_teardown(nest->vept);
> + nest->vept = 0;
> +
> nest->guest_vmxon_pa = 0;
> __vmpclear(virt_to_maddr(nest->svmcs));
>
> @@ -529,6 +536,67 @@
> return vmx_nest_handle_vmresume(regs);
> }
>
> +int vmx_nest_handle_invept(struct cpu_user_regs *regs)
> +{
> + struct vcpu *v = current;
> + struct vmx_inst_decoded decode;
> + struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> + mfn_t mfn;
> + u64 eptp;
> + int type;
> +
> + if ( unlikely(!nest->guest_vmxon_pa) )
> + goto invalid_op;
> +
> + decode_vmx_inst(regs, &decode);
> +
> + hvm_copy_from_guest_virt(&eptp, decode.mem, sizeof(eptp), 0);
> + type = reg_read(regs, decode.reg2);
Needs error handling like the other new instructions.
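E.g. (untested sketch; I'm assuming the VMFAIL_INVALID result code from
the earlier patches in this series):

    if ( hvm_copy_from_guest_virt(&eptp, decode.mem, sizeof(eptp), 0)
         != HVMCOPY_okay )
    {
        /* Operand fetch failed: fail the instruction rather than
         * carrying on with an uninitialised eptp. */
        vmreturn(regs, VMFAIL_INVALID);
        return X86EMUL_OKAY;
    }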
> + /* TODO: physical invept on other cpus */
What's the plan here?  If this vcpu has run on other physical CPUs,
they can still hold stale EPT TLB entries for this guest, so presumably
the INVEPT needs to reach them too?
> + switch ( type )
> + {
> + case 1:
> + mfn = vept_invalidate(nest->vept, eptp);
> + if ( eptp == nest->geptp )
> + nest->geptp = 0;
> +
> + if ( __mfn_valid(mfn_x(mfn)) )
> + __invept(1, mfn_x(mfn) << PAGE_SHIFT | (eptp & 0xfff), 0);
> + break;
> + case 2:
> + vept_invalidate_all(nest->vept);
> + nest->geptp = 0;
> + break;
> + default:
> + gdprintk(XENLOG_ERR, "nest: unsupported invept type %d\n", type);
> + break;
> + }
> +
> + vmreturn(regs, VMSUCCEED);
> +
> + return X86EMUL_OKAY;
> +
> +invalid_op:
> + hvm_inject_exception(TRAP_invalid_op, 0, 0);
> + return X86EMUL_EXCEPTION;
> +}
> +
> +int vmx_nest_vept(struct vcpu *v)
> +{
> + struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> + int r = 0;
> +
> + if ( paging_mode_hap(v->domain) &&
> + (__get_vvmcs(nest->vvmcs, CPU_BASED_VM_EXEC_CONTROL) &
> + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
> + (__get_vvmcs(nest->vvmcs, SECONDARY_VM_EXEC_CONTROL) &
> + SECONDARY_EXEC_ENABLE_EPT) )
> + r = 1;
> +
> + return r;
> +}
> +
> /*
> * Nested VMX context switch
> */
> @@ -739,7 +807,14 @@
> vvmcs_to_shadow(nest->vvmcs, CR0_GUEST_HOST_MASK);
> vvmcs_to_shadow(nest->vvmcs, CR4_GUEST_HOST_MASK);
>
> - /* TODO: PDPTRs for nested ept */
> + if ( vmx_nest_vept(v) )
> + {
> + vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR0);
> + vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR1);
> + vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR2);
> + vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR3);
> + }
> +
> /* TODO: CR3 target control */
> }
>
> @@ -787,14 +862,32 @@
> }
> #endif
>
> +
> + /* loading EPT_POINTER for L2 */
> + if ( vmx_nest_vept(v) )
> + {
> + u64 geptp;
> + mfn_t mfn;
> +
> + geptp = __get_vvmcs(nest->vvmcs, EPT_POINTER);
> + if ( geptp != nest->geptp )
> + {
> + mfn = vept_load_eptp(nest->vept, geptp);
What if vept_load_eptp() returns INVALID_MFN?
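I'd expect at least something like this before the __vmwrite()s
(untested sketch; assumes this path can just bail out with a crashed
guest):

    mfn = vept_load_eptp(nest->vept, geptp);
    if ( !__mfn_valid(mfn_x(mfn)) )
    {
        /* Couldn't shadow this EPTP: don't load a junk EPT_POINTER
         * into the sVMCS. */
        gdprintk(XENLOG_ERR, "nest: no shadow for eptp %"PRIx64"\n", geptp);
        domain_crash(v->domain);
        return;
    }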
> + nest->geptp = geptp;
> +
> + __vmwrite(EPT_POINTER, (mfn_x(mfn) << PAGE_SHIFT) | 0x1e);
> +#ifdef __i386__
> + __vmwrite(EPT_POINTER_HIGH, (mfn_x(mfn) << PAGE_SHIFT) >> 32);
> +#endif
> + }
> + }
> +
> regs->rip = __get_vvmcs(nest->vvmcs, GUEST_RIP);
> regs->rsp = __get_vvmcs(nest->vvmcs, GUEST_RSP);
> regs->rflags = __get_vvmcs(nest->vvmcs, GUEST_RFLAGS);
>
> /* updating host cr0 to sync TS bit */
> __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
> -
> - /* TODO: EPT_POINTER */
> }
>
> static void sync_vvmcs_guest_state(struct vmx_nest_struct *nest)
> @@ -1064,8 +1157,26 @@
> break;
> }
>
> + case EXIT_REASON_EPT_VIOLATION:
> + {
> + unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> + paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
> +#ifdef __i386__
> + gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
> +#endif
> + if ( vmx_nest_vept(v) )
> + {
> + if ( !vept_ept_violation(nest->vept, nest->geptp,
> + exit_qualification, gpa) )
> + bypass_l0 = 1;
> + else
> + nest->vmexit_pending = 1;
Since bypass_l0 also gets set whenever vmexit_pending is set, it looks
like it's always going to be set here.  Does that mean we never handle
a real EPT violation at L0?  I would expect three possible outcomes
here: give the violation to L1, give it to L0, or fix it up in the vept
and discard it.
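Roughly (sketch only -- the VEPT_* codes are made up, and
vept_ept_violation() would have to return a three-way result instead of
0/1):

    switch ( vept_ept_violation(nest->vept, nest->geptp,
                                exit_qualification, gpa) )
    {
    case VEPT_INJECT_L1:   /* L1's tables say this really faults */
        nest->vmexit_pending = 1;
        break;
    case VEPT_FIXED:       /* shadow entry filled in: just re-run L2 */
        bypass_l0 = 1;
        break;
    case VEPT_PASS_TO_L0:  /* L0's own p2m needs to handle it */
    default:
        break;             /* fall through to the normal L0 handler */
    }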
> + }
> +
> + break;
> + }
> +
> case EXIT_REASON_WBINVD:
> - case EXIT_REASON_EPT_VIOLATION:
> case EXIT_REASON_EPT_MISCONFIG:
> case EXIT_REASON_EXTERNAL_INTERRUPT:
> /* pass to L0 handler */
> @@ -1229,11 +1340,14 @@
> data = (data << 32) | eax;
> break;
> case MSR_IA32_VMX_PROCBASED_CTLS:
> + mask = paging_mode_hap(current->domain)?
> + 0: CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> +
> rdmsr(regs->ecx, eax, edx);
> #define REMOVED_EXEC_CONTROL_CAP (CPU_BASED_TPR_SHADOW \
> - | CPU_BASED_ACTIVATE_MSR_BITMAP \
> - | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
> + | CPU_BASED_ACTIVATE_MSR_BITMAP)
> data = edx & ~REMOVED_EXEC_CONTROL_CAP;
> + data = edx & ~mask;
> data = (data << 32) | eax;
> break;
> case MSR_IA32_VMX_EXIT_CTLS:
> @@ -1254,12 +1368,20 @@
> data = (data << 32) | eax;
> break;
> case MSR_IA32_VMX_PROCBASED_CTLS2:
> - mask = 0;
> + mask = paging_mode_hap(current->domain)?
> + SECONDARY_EXEC_ENABLE_EPT : 0;
>
> rdmsr(regs->ecx, eax, edx);
> data = edx & mask;
> data = (data << 32) | eax;
> break;
> + case MSR_IA32_VMX_EPT_VPID_CAP:
> + rdmsr(regs->ecx, eax, edx);
> +#define REMOVED_EPT_VPID_CAP_HIGH ( 1 | 1<<8 | 1<<9 | 1<<10 | 1<<11 )
> +#define REMOVED_EPT_VPID_CAP_LOW ( 1<<16 | 1<<17 | 1<<26 )
> + data = edx & ~REMOVED_EPT_VPID_CAP_HIGH;
> + data = (data << 32) | (eax & ~REMOVED_EPT_VPID_CAP_LOW);
> + break;
>
> /* pass through MSRs */
> case IA32_FEATURE_CONTROL_MSR:
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/vept.c
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/arch/x86/hvm/vmx/vept.c Thu Apr 22 22:30:10 2010 +0800
> @@ -0,0 +1,574 @@
> +/*
> + * vept.c: virtual EPT for nested virtualization
> + *
> + * Copyright (c) 2010, Intel Corporation.
> + * Author: Qing He <qing.he@xxxxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.
> + *
> + */
> +
> +#include <xen/config.h>
> +#include <xen/types.h>
> +#include <xen/list.h>
> +#include <xen/mm.h>
> +#include <xen/paging.h>
> +#include <xen/domain_page.h>
> +#include <xen/sched.h>
> +#include <asm/page.h>
> +#include <xen/numa.h>
> +#include <asm/hvm/vmx/vmx.h>
> +#include <asm/hvm/vmx/vept.h>
> +
> +#undef mfn_to_page
> +#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
> +#undef mfn_valid
> +#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
> +#undef page_to_mfn
> +#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
> +
> +/*
> + * This virtual EPT implementation is independent to p2m facility
> + * and has some different characteristics. It works in a similar
> + * way as shadow page table (guest table and host table composition),
> + * but is per-vcpu, and of vTLB style
> + * - per vCPU so no lock is required
What happens when dom0 changes domU's p2m table? Don't you need to
shoot down existing vEPT tables from a foreign CPU?
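E.g. if dom0 maps or unmaps a page in the L1 guest's p2m, any
translations already cached in a vEPT slot are stale.  Roughly what I'd
expect is a hook in the p2m update path, something like (names made up,
just a sketch):

    /* Hypothetical: called whenever d's p2m changes. */
    static void vept_flush_domain(struct domain *d)
    {
        struct vcpu *v;

        for_each_vcpu ( d, v )
        {
            /* Can't walk another vcpu's vept without locking; flag it
             * and let that vcpu flush on its next nested vmentry. */
            v->arch.hvm_vmx.nest.vept_flush_pending = 1;
        }
        /* ...plus an IPI or pause so nothing keeps running on stale
         * shadow EPT entries in the meantime. */
    }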
> + * - vTLB style signifies honoring all invalidations, and not
> + * write protection. Unlike ordinary page table, since EPT updates
> + * and invalidations are minimal in a well written VMM, overhead
> + * is also minimized.
> + *
> + * The physical root is loaded directly to L2 sVMCS, without entering
> + * any other host controls. Multiple `cache slots' are maintained
> + * for multiple guest EPTPs, with simple LRU replacement.
> + *
> + * One of the limitations so far, is that it doesn't work with
> + * L0 emulation code, so L1 p2m_mmio_direct on top of L0 p2m_mmio_dm
> + * is not supported as for now.
Is this something you intend to fix before we check it in?
> + */
> +
> +#define VEPT_MAX_SLOTS 8
> +#define VEPT_ALLOCATION_SIZE 512
> +
> +struct vept_slot {
> + u64 eptp; /* guest eptp */
> + mfn_t root; /* root of phys table */
> + struct list_head list;
> +
> + struct page_list_head page_list;
> +};
> +
> +struct vept {
> + struct list_head used_slots; /* lru: new->tail, old->head */
> + struct list_head free_slots;
> +
> + int total_pages;
> + int free_pages;
> + struct page_list_head freelist;
> +
> + struct vcpu *vcpu;
> +};
> +
> +
> +static struct vept_slot *__get_eptp_slot(struct vept *vept, u64 geptp)
> +{
> + struct vept_slot *slot, *tmp;
> +
> + list_for_each_entry_safe( slot, tmp, &vept->used_slots, list )
> + if ( slot->eptp == geptp )
> + return slot;
> +
> + return NULL;
> +}
> +
> +static struct vept_slot *get_eptp_slot(struct vept *vept, u64 geptp)
> +{
> + struct vept_slot *slot;
> +
> + slot = __get_eptp_slot(vept, geptp);
> + if ( slot != NULL )
> + list_del(&slot->list);
> +
> + return slot;
> +}
> +
> +static void __clear_slot(struct vept *vept, struct vept_slot *slot)
> +{
> + struct page_info *pg;
> +
> + slot->eptp = 0;
> +
> + while ( !page_list_empty(&slot->page_list) )
> + {
> + pg = page_list_remove_head(&slot->page_list);
> + page_list_add_tail(pg, &vept->freelist);
> +
> + vept->free_pages++;
> + }
> +}
> +
> +static struct vept_slot *get_free_slot(struct vept *vept)
> +{
> + struct vept_slot *slot = NULL;
> +
> + if ( !list_empty(&vept->free_slots) )
> + {
> + slot = list_entry(vept->free_slots.next, struct vept_slot, list);
> + list_del(&slot->list);
> + }
> + else if ( !list_empty(&vept->used_slots) )
> + {
> + slot = list_entry(vept->used_slots.next, struct vept_slot, list);
> + list_del(&slot->list);
> + __clear_slot(vept, slot);
> + }
> +
> + return slot;
> +}
> +
> +static void clear_all_slots(struct vept *vept)
> +{
> + struct vept_slot *slot, *tmp;
> +
> + list_for_each_entry_safe( slot, tmp, &vept->used_slots, list )
> + {
> + list_del(&slot->list);
> + __clear_slot(vept, slot);
> + list_add_tail(&slot->list, &vept->free_slots);
> + }
> +}
> +
> +static int free_some_pages(struct vept *vept, struct vept_slot *curr)
> +{
> + struct vept_slot *slot;
> + int r = 0;
> +
> + if ( !list_empty(&vept->used_slots) )
> + {
> + slot = list_entry(vept->used_slots.next, struct vept_slot, list);
> + if ( slot != curr )
> + {
> + list_del(&slot->list);
> + __clear_slot(vept, slot);
> + list_add_tail(&slot->list, &vept->free_slots);
> +
> + r = 1;
> + }
> + }
> +
> + return r;
> +}
> +
> +struct vept *vept_init(struct vcpu *v)
> +{
> + struct vept *vept;
> + struct vept_slot *slot;
> + struct page_info *pg;
> + int i;
> +
> + vept = xmalloc(struct vept);
> + if ( vept == NULL )
> + goto out;
> +
> + memset(vept, 0, sizeof(*vept));
> + vept->vcpu = v;
> +
> + INIT_PAGE_LIST_HEAD(&vept->freelist);
> + INIT_LIST_HEAD(&vept->used_slots);
> + INIT_LIST_HEAD(&vept->free_slots);
> +
> + for ( i = 0; i < VEPT_MAX_SLOTS; i++ )
> + {
> + slot = xmalloc(struct vept_slot);
> + if ( slot == NULL )
> + break;
> +
> + memset(slot, 0, sizeof(*slot));
> +
> + INIT_LIST_HEAD(&slot->list);
> + INIT_PAGE_LIST_HEAD(&slot->page_list);
> +
> + list_add(&slot->list, &vept->free_slots);
> + }
> +
> + for ( i = 0; i < VEPT_ALLOCATION_SIZE; i++ )
Why a fixed 2MB allocation? What if your nested domains are very large?
> + {
> + pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(v->domain)));
Shouldn't this be allocated from the paging pool like other EPT memory?
> + if ( pg == NULL )
> + break;
Return an error?
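E.g. (sketch):

    if ( pg == NULL )
    {
        /* Not enough memory to be useful: unwind what we have and
         * fail, rather than limping along with a tiny pool. */
        vept_teardown(vept);
        return NULL;
    }

and then the caller (vmx_nest_handle_vmxon(), by the look of the first
hunk) needs to check for NULL and fail the VMXON.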
> + page_list_add_tail(pg, &vept->freelist);
> + vept->total_pages++;
> + vept->free_pages++;
> + }
> +
> + out:
> + return vept;
> +}
> +
> +void vept_teardown(struct vept *vept)
> +{
> + struct page_info *pg;
> + struct vept_slot *slot, *tmp;
> +
> + clear_all_slots(vept);
> +
> + while ( !page_list_empty(&vept->freelist) )
> + {
> + pg = page_list_remove_head(&vept->freelist);
> + free_domheap_page(pg);
> + vept->free_pages++;
> + vept->total_pages++;
> + }
> +
> + list_for_each_entry_safe( slot, tmp, &vept->free_slots, list )
> + xfree(slot);
> +
> + xfree(vept);
> +}
> +
> +mfn_t vept_load_eptp(struct vept *vept, u64 geptp)
> +{
> + struct page_info *pg;
> + struct vept_slot *slot;
> + mfn_t mfn = _mfn(INVALID_MFN);
> + void *addr;
> +
> + ASSERT(vept->vcpu == current);
> +
> + slot = get_eptp_slot(vept, geptp);
> + if ( slot == NULL )
> + {
> + slot = get_free_slot(vept);
> + if ( unlikely(slot == NULL) )
> + {
> + gdprintk(XENLOG_ERR, "nest: can't get free slot\n");
> + return mfn;
> + }
> +
> + while ( !vept->free_pages )
> + if ( !free_some_pages(vept, slot) )
> + {
> + slot->eptp = 0;
> + list_add_tail(&slot->list, &vept->free_slots);
> + gdprintk(XENLOG_ERR, "nest: vept no free pages\n");
> +
> + return mfn;
> + }
> +
> + vept->free_pages--;
> + pg = page_list_remove_head(&vept->freelist);
> +
> + mfn = page_to_mfn(pg);
> + addr = map_domain_page(mfn_x(mfn));
> + clear_page(addr);
> + unmap_domain_page(addr);
> + page_list_add_tail(pg, &slot->page_list);
> + slot->eptp = geptp;
> + slot->root = mfn;
> + }
> +
> + mfn = slot->root;
> + list_add_tail(&slot->list, &vept->used_slots);
> +
> + return mfn;
> +}
> +
> +mfn_t vept_invalidate(struct vept *vept, u64 geptp)
> +{
> + struct vept_slot *slot;
> + mfn_t mfn = _mfn(INVALID_MFN);
> +
> + ASSERT(vept->vcpu == current);
> +
> + slot = get_eptp_slot(vept, geptp);
> + if ( slot != NULL )
> + {
> + mfn = slot->root;
> + __clear_slot(vept, slot);
> + list_add_tail(&slot->list, &vept->free_slots);
> + }
> +
> + return mfn;
> +}
> +
> +void vept_invalidate_all(struct vept *vept)
> +{
> + ASSERT(vept->vcpu == current);
> +
> + clear_all_slots(vept);
> +}
> +
> +/*
> + * guest EPT walk and EPT violation
> + */
> +struct ept_walk {
> + unsigned long gfn;
> + unsigned long gfn_remainder;
> + ept_entry_t l4e, l3e, l2e, l1e;
> + mfn_t l4mfn, l3mfn, l2mfn, l1mfn;
> + int sp;
> +};
> +typedef struct ept_walk ept_walk_t;
> +
> +#define GEPT_NORMAL_PAGE 0
> +#define GEPT_SUPER_PAGE 1
> +#define GEPT_NOT_PRESENT 2
> +static int guest_ept_next_level(struct vcpu *v, ept_entry_t **table,
> + unsigned long *gfn_remainder, int level, u32 *ar,
> + ept_entry_t *entry, mfn_t *next_mfn)
> +{
> + int index;
> + ept_entry_t *ept_entry;
> + ept_entry_t *next;
> + p2m_type_t p2mt;
> + int rc = GEPT_NORMAL_PAGE;
> + mfn_t mfn;
> +
> + index = *gfn_remainder >> (level * EPT_TABLE_ORDER);
> +
> + ept_entry = (*table) + index;
> + *entry = *ept_entry;
> + *ar &= entry->epte & 0x7;
> +
> + *gfn_remainder &= (1UL << (level * EPT_TABLE_ORDER)) - 1;
> +
> + if ( !(ept_entry->epte & 0x7) )
> + rc = GEPT_NOT_PRESENT;
> + else if ( ept_entry->sp_avail )
> + rc = GEPT_SUPER_PAGE;
> + else
> + {
> + mfn = gfn_to_mfn(v->domain, ept_entry->mfn, &p2mt);
> + if ( !p2m_is_ram(p2mt) )
> + return GEPT_NOT_PRESENT;
> +
> + if ( next_mfn )
> + {
> + next = map_domain_page(mfn_x(mfn));
> + unmap_domain_page(*table);
> +
> + *table = next;
> + *next_mfn = mfn;
> + }
> + }
> +
> + return rc;
> +}
> +
> +static u32 guest_walk_ept(struct vcpu *v, ept_walk_t *gw,
> + u64 geptp, u64 ggpa)
> +{
> + ept_entry_t *table;
> + p2m_type_t p2mt;
> + int rc;
> + u32 ar = 0x7;
> +
> + unsigned long gfn = (unsigned long) (ggpa >> PAGE_SHIFT);
> + unsigned long gfn_remainder = gfn;
> +
> + memset(gw, 0, sizeof(*gw));
> + gw->gfn = gfn;
> + gw->sp = 0;
> +
> + gw->l4mfn = gfn_to_mfn(v->domain, geptp >> PAGE_SHIFT, &p2mt);
> + if ( !p2m_is_ram(p2mt) )
> + return 0;
> +
> + table = map_domain_page(mfn_x(gw->l4mfn));
> +
> + rc = guest_ept_next_level(v, &table, &gfn_remainder, 3, &ar,
> + &gw->l4e, &gw->l3mfn);
> +
> + if ( rc )
> + goto out;
> +
> + rc = guest_ept_next_level(v, &table, &gfn_remainder, 2, &ar,
> + &gw->l3e, &gw->l2mfn);
> +
> + if ( rc == GEPT_SUPER_PAGE )
> + gw->sp = 2;
> + if ( rc )
> + goto out;
> +
> + rc = guest_ept_next_level(v, &table, &gfn_remainder, 1, &ar,
> + &gw->l2e, &gw->l1mfn);
> +
> + if ( rc == GEPT_SUPER_PAGE )
> + gw->sp = 1;
> + if ( rc )
> + goto out;
> +
> + rc = guest_ept_next_level(v, &table, &gfn_remainder, 0, &ar,
> + &gw->l1e, NULL);
> +
> + out:
> + gw->gfn_remainder = gfn_remainder;
> + unmap_domain_page(*table);
> + return ar;
> +}
> +
> +static void epte_set_ar_bits(ept_entry_t *entry, unsigned long ar)
> +{
> + entry->epte &= ~0x7f;
> + entry->epte |= ar & 0x7f;
> +}
> +
> +static int shadow_ept_next_level(struct vept *vept, struct vept_slot *slot,
> + ept_entry_t **table, unsigned long *gfn_remainder,
> + int level, u32 *ar, ept_entry_t gentry)
> +{
> + int index;
> + ept_entry_t *sentry;
> + ept_entry_t *next;
> + mfn_t mfn;
> + struct page_info *pg;
> +
> + index = *gfn_remainder >> (level * EPT_TABLE_ORDER);
> +
> + sentry = (*table) + index;
> + *ar = sentry->epte & 0x7;
> +
> + *gfn_remainder &= (1UL << (level * EPT_TABLE_ORDER)) - 1;
> +
> + if ( !(sentry->epte & 0x7) )
> + {
> + while ( !vept->free_pages )
> + if ( !free_some_pages(vept, slot) )
> + {
> + gdprintk(XENLOG_ERR, "nest: vept no free pages\n");
> + return 0;
> + }
> +
> + vept->free_pages--;
> + pg = page_list_remove_head(&vept->freelist);
> + page_list_add_tail(pg, &slot->page_list);
> + mfn = page_to_mfn(pg);
> + next = map_domain_page(mfn_x(mfn));
> + clear_page(next);
> +
> + sentry->mfn = mfn_x(mfn);
> + }
> + else
> + {
> + next = map_domain_page(sentry->mfn);
> + }
> +
> + epte_set_ar_bits(sentry, gentry.epte);
> +
> + unmap_domain_page(*table);
> + *table = next;
> +
> + return 1;
> +}
> +
> +int vept_ept_violation(struct vept *vept, u64 geptp,
> + unsigned long qualification, paddr_t addr)
> +{
> + ept_walk_t gw;
> + struct vept_slot *slot;
> + ept_entry_t *table, *gept;
> + ept_entry_t *sentry, *gentry;
> + u32 old_entry, sp_ar = 0;
> + p2m_type_t p2mt;
> + unsigned long mfn_start = 0;
> + unsigned long gfn_remainder;
> + int rc, i;
> +
> + ASSERT(vept->vcpu == current);
> +
> + slot = __get_eptp_slot(vept, geptp);
> + if ( unlikely(slot == NULL) )
> + return 0;
> +
> + rc = guest_walk_ept(vept->vcpu, &gw, geptp, addr);
> +
> + if ( !(rc & (qualification & 0x7)) ) /* inject to guest */
> + return 1;
> +
> + if ( gw.sp == 2 ) /* 1G */
> + {
> + sp_ar = gw.l3e.epte & 0x7;
> + mfn_start = gw.l3e.mfn +
> + (gw.gfn_remainder & (~(1 << EPT_TABLE_ORDER) - 1));
> + }
> + if ( gw.sp == 1 ) /* 2M */
> + {
> + sp_ar = gw.l2e.epte & 0x7;
> + mfn_start = gw.l2e.mfn;
> + }
> + else
> + mfn_start = 0;
> +
> + table = map_domain_page(mfn_x(slot->root));
> + gfn_remainder = gw.gfn;
> +
> + shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 3,
> + &old_entry, gw.l4e);
What if shadow_ept_next_level() returns 0?
> + shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 2,
> + &old_entry, gw.l3e);
Ditto
> + shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 1,
> + &old_entry, (gw.sp == 2) ? gw.l3e : gw.l2e);
Ditto
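i.e. check all three and bail out, something like (sketch):

    if ( !shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 3,
                                &old_entry, gw.l4e) ||
         !shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 2,
                                &old_entry, gw.l3e) ||
         !shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 1,
                                &old_entry, (gw.sp == 2) ? gw.l3e : gw.l2e) )
    {
        /* Out of shadow pages: don't keep writing into whatever table
         * is currently mapped.  (Pick the right failure policy here --
         * retry at L2, or punt the violation to L1.) */
        unmap_domain_page(table);
        return 0;
    }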
> + /* if l1p is just allocated, do a full prefetch */
> + if ( !old_entry && !gw.sp )
> + {
> + gept = map_domain_page(mfn_x(gw.l1mfn));
> + for ( i = 0; i < 512; i++ )
> + {
> + gentry = gept + i;
> + sentry = table + i;
> + if ( gentry->epte & 0x7 )
> + {
> + sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> + gentry->mfn, &p2mt));
> + epte_set_ar_bits(sentry, gentry->epte);
> + }
> + else
> + sentry->epte = 0;
> + }
> + unmap_domain_page(gept);
> + }
> + else if ( !old_entry && gw.sp )
> + {
> + for ( i = 0; i < 512; i++ )
> + {
> + sentry = table + i;
> + sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> + mfn_start + i, &p2mt));
> + epte_set_ar_bits(sentry, sp_ar);
> + }
> + }
> + else if ( old_entry && !gw.sp )
> + {
> + i = gw.gfn & ((1 << EPT_TABLE_ORDER) - 1);
> + sentry = table + i;
> + sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> + gw.l1e.mfn, &p2mt));
> + epte_set_ar_bits(sentry, gw.l1e.epte);
> + }
> + else // old_entry && gw.sp
> + {
> + i = gw.gfn & ((1 << EPT_TABLE_ORDER) - 1);
> + sentry = table + i;
> + sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> + mfn_start + i, &p2mt));
> + epte_set_ar_bits(sentry, sp_ar);
> + }
> +
> + unmap_domain_page(table);
> + return 0;
> +}
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/vmx.c
> --- a/xen/arch/x86/hvm/vmx/vmx.c Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/arch/x86/hvm/vmx/vmx.c Thu Apr 22 22:30:10 2010 +0800
> @@ -1032,6 +1032,14 @@
> p2m_type_t p2mt;
> char *p;
>
> + /*
> + * If in nesting EPT operation, L0 doesn't have the knowledge on
> + * how to interpret CR3, it's L1's responsibility to provide
> + * GUEST_PDPTRn, we rely solely on them.
> + */
> + if ( v->arch.hvm_vcpu.in_nesting && vmx_nest_vept(v) )
> + return;
> +
> /* EPT needs to load PDPTRS into VMCS for PAE. */
> if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
> return;
> @@ -2705,6 +2713,11 @@
> if ( vmx_nest_handle_vmxon(regs) == X86EMUL_OKAY )
> __update_guest_eip(inst_len);
> break;
> + case EXIT_REASON_INVEPT:
> + inst_len = __get_instruction_length();
> + if ( vmx_nest_handle_invept(regs) == X86EMUL_OKAY )
> + __update_guest_eip(inst_len);
> + break;
>
> case EXIT_REASON_MWAIT_INSTRUCTION:
> case EXIT_REASON_MONITOR_INSTRUCTION:
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/include/asm-x86/hvm/vmx/nest.h
> --- a/xen/include/asm-x86/hvm/vmx/nest.h Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/include/asm-x86/hvm/vmx/nest.h Thu Apr 22 22:30:10 2010 +0800
> @@ -47,6 +47,9 @@
>
> unsigned long intr_info;
> unsigned long error_code;
> +
> + u64 geptp;
> + struct vept *vept;
> };
>
> asmlinkage void vmx_nest_switch_mode(void);
> @@ -64,6 +67,8 @@
> int vmx_nest_handle_vmresume(struct cpu_user_regs *regs);
> int vmx_nest_handle_vmlaunch(struct cpu_user_regs *regs);
>
> +int vmx_nest_handle_invept(struct cpu_user_regs *regs);
> +
> void vmx_nest_update_exec_control(struct vcpu *v, unsigned long value);
> void vmx_nest_update_secondary_exec_control(struct vcpu *v,
> unsigned long value);
> @@ -81,4 +86,6 @@
> int vmx_nest_msr_write_intercept(struct cpu_user_regs *regs,
> u64 msr_content);
>
> +int vmx_nest_vept(struct vcpu *v);
> +
> #endif /* __ASM_X86_HVM_NEST_H__ */
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/include/asm-x86/hvm/vmx/vept.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-x86/hvm/vmx/vept.h Thu Apr 22 22:30:10 2010 +0800
> @@ -0,0 +1,10 @@
> +#include <asm/hvm/vmx/vmx.h>
> +
> +
> +struct vept *vept_init(struct vcpu *v);
> +void vept_teardown(struct vept *vept);
> +mfn_t vept_load_eptp(struct vept *vept, u64 eptp);
> +mfn_t vept_invalidate(struct vept *vept, u64 eptp);
> +void vept_invalidate_all(struct vept *vept);
> +int vept_ept_violation(struct vept *vept, u64 eptp,
> + unsigned long qualification, paddr_t addr);
>
--
Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd. (Company #02937203, SL9 0BG)
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel