xen-devel

Re: [Xen-devel] [PATCH 15/17] vmx: nest: virtual ept for nested

To: Qing He <qing.he@xxxxxxxxx>
Subject: Re: [Xen-devel] [PATCH 15/17] vmx: nest: virtual ept for nested
From: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Date: Thu, 20 May 2010 13:21:51 +0100
Cc: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Delivery-date: Thu, 20 May 2010 05:22:51 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
In-reply-to: <1271929289-18572-16-git-send-email-qing.he@xxxxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
References: <1271929289-18572-1-git-send-email-qing.he@xxxxxxxxx> <1271929289-18572-16-git-send-email-qing.he@xxxxxxxxx>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mutt/1.5.18 (2008-05-17)
At 10:41 +0100 on 22 Apr (1271932887), Qing He wrote:
> This patch adds virtual EPT capability to L1.
> It's implemented as a simple per-vCPU, vTLB-like component
> independent of the domain-wide p2m.
> 
> Signed-off-by: Qing He <qing.he@xxxxxxxxx>

> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/nest.c
> --- a/xen/arch/x86/hvm/vmx/nest.c       Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/arch/x86/hvm/vmx/nest.c       Thu Apr 22 22:30:10 2010 +0800
> @@ -26,6 +26,7 @@
>  #include <asm/hvm/vmx/vmx.h>
>  #include <asm/hvm/vmx/vvmcs.h>
>  #include <asm/hvm/vmx/nest.h>
> +#include <asm/hvm/vmx/vept.h>
> 
>  /*
>   * VMX instructions support functions
> @@ -295,6 +296,9 @@
>      __vmptrld(virt_to_maddr(nest->hvmcs));
>      v->arch.hvm_vmx.launched = 0;
> 
> +    nest->geptp = 0;
> +    nest->vept = vept_init(v);
> +
>      vmreturn(regs, VMSUCCEED);
> 
>  out:
> @@ -313,6 +317,9 @@
>      if ( unlikely(!nest->guest_vmxon_pa) )
>          goto invalid_op;
> 
> +    vept_teardown(nest->vept);
> +    nest->vept = 0;
> +
>      nest->guest_vmxon_pa = 0;
>      __vmpclear(virt_to_maddr(nest->svmcs));
> 
> @@ -529,6 +536,67 @@
>      return vmx_nest_handle_vmresume(regs);
>  }
> 
> +int vmx_nest_handle_invept(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_inst_decoded decode;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    mfn_t mfn;
> +    u64 eptp;
> +    int type;
> +
> +    if ( unlikely(!nest->guest_vmxon_pa) )
> +        goto invalid_op;
> +
> +    decode_vmx_inst(regs, &decode);
> +
> +    hvm_copy_from_guest_virt(&eptp, decode.mem, sizeof(eptp), 0);
> +    type = reg_read(regs, decode.reg2);

Needs error handling like the other new instructions. 
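
Something along these lines, perhaps (rough, untested sketch -- assuming
decode_vmx_inst() and hvm_copy_from_guest_virt() report failure the same
way as in the earlier patches, and that the usual vmreturn()/VMFAIL_*
convention applies):

    if ( decode_vmx_inst(regs, &decode) != X86EMUL_OKAY )
        return X86EMUL_EXCEPTION;   /* decode assumed to raise the fault */

    if ( hvm_copy_from_guest_virt(&eptp, decode.mem, sizeof(eptp), 0)
         != HVMCOPY_okay )
    {
        /* Bad operand address: fail the instruction back to L1. */
        vmreturn(regs, VMFAIL_INVALID);
        return X86EMUL_OKAY;
    }

    type = reg_read(regs, decode.reg2);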

> +    /* TODO: physical invept on other cpus */

?

> +    switch ( type )
> +    {
> +    case 1:
> +        mfn = vept_invalidate(nest->vept, eptp);
> +        if ( eptp == nest->geptp )
> +            nest->geptp = 0;
> +
> +        if ( __mfn_valid(mfn_x(mfn)) )
> +            __invept(1, mfn_x(mfn) << PAGE_SHIFT | (eptp & 0xfff), 0);
> +        break;
> +    case 2:
> +        vept_invalidate_all(nest->vept);
> +        nest->geptp = 0;
> +        break;
> +    default:
> +        gdprintk(XENLOG_ERR, "nest: unsupported invept type %d\n", type);
> +        break;
> +    }
> +
> +    vmreturn(regs, VMSUCCEED);
> +
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +int vmx_nest_vept(struct vcpu *v)
> +{
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    int r = 0;
> +
> +    if ( paging_mode_hap(v->domain) &&
> +         (__get_vvmcs(nest->vvmcs, CPU_BASED_VM_EXEC_CONTROL) &
> +          CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
> +         (__get_vvmcs(nest->vvmcs, SECONDARY_VM_EXEC_CONTROL) &
> +          SECONDARY_EXEC_ENABLE_EPT) )
> +        r = 1;
> +
> +    return r;
> +}
> +
>  /*
>   * Nested VMX context switch
>   */
> @@ -739,7 +807,14 @@
>      vvmcs_to_shadow(nest->vvmcs, CR0_GUEST_HOST_MASK);
>      vvmcs_to_shadow(nest->vvmcs, CR4_GUEST_HOST_MASK);
> 
> -    /* TODO: PDPTRs for nested ept */
> +    if ( vmx_nest_vept(v) )
> +    {
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR0);
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR1);
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR2);
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR3);
> +    }
> +
>      /* TODO: CR3 target control */
>  }
> 
> @@ -787,14 +862,32 @@
>      }
>  #endif
> 
> +
> +    /* loading EPT_POINTER for L2 */
> +    if ( vmx_nest_vept(v) )
> +    {
> +        u64 geptp;
> +        mfn_t mfn;
> +
> +        geptp = __get_vvmcs(nest->vvmcs, EPT_POINTER);
> +        if ( geptp != nest->geptp )
> +        {
> +            mfn = vept_load_eptp(nest->vept, geptp);

What if vept_load_eptp() returns INVALID_MFN?
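
At a minimum I'd expect something like the below (sketch only; the right
failure action needs more thought -- domain_crash() is just a placeholder):

    mfn = vept_load_eptp(nest->vept, geptp);
    if ( unlikely(!__mfn_valid(mfn_x(mfn))) )
    {
        gdprintk(XENLOG_ERR, "nest: failed to shadow eptp %"PRIx64"\n",
                 geptp);
        domain_crash(v->domain);  /* placeholder: failing the entry back
                                   * to L1 would be friendlier */
        return;
    }
    nest->geptp = geptp;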

> +            nest->geptp = geptp;
> +
> +            __vmwrite(EPT_POINTER, (mfn_x(mfn) << PAGE_SHIFT) | 0x1e);
> +#ifdef __i386__
> +            __vmwrite(EPT_POINTER_HIGH, (mfn_x(mfn) << PAGE_SHIFT) >> 32);
> +#endif
> +        }
> +    }
> +
>      regs->rip = __get_vvmcs(nest->vvmcs, GUEST_RIP);
>      regs->rsp = __get_vvmcs(nest->vvmcs, GUEST_RSP);
>      regs->rflags = __get_vvmcs(nest->vvmcs, GUEST_RFLAGS);
> 
>      /* updating host cr0 to sync TS bit */
>      __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
> -
> -    /* TODO: EPT_POINTER */
>  }
> 
>  static void sync_vvmcs_guest_state(struct vmx_nest_struct *nest)
> @@ -1064,8 +1157,26 @@
>          break;
>      }
> 
> +    case EXIT_REASON_EPT_VIOLATION:
> +    {
> +        unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
> +#ifdef __i386__
> +        gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
> +#endif
> +        if ( vmx_nest_vept(v) )
> +        {
> +            if ( !vept_ept_violation(nest->vept, nest->geptp,
> +                     exit_qualification, gpa) )
> +                bypass_l0 = 1;
> +            else
> +                nest->vmexit_pending = 1;

Since bypass_l0 is set from vmexit_pending() here, it looks like it's
always going to be set.  Does that mean we never handle a real EPT
violation at L0?  I would expect there to be three possible outcomes
here: give the violation to L1, give it to L0, or fix it in the vept and
discard it.
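
I.e. something of this shape (VEPT_FIXED / VEPT_INJECT_L1 / VEPT_PASS_L0
are invented names; vept_ept_violation() would have to return a three-way
result instead of 0/1):

    switch ( vept_ept_violation(nest->vept, nest->geptp,
                                exit_qualification, gpa) )
    {
    case VEPT_FIXED:       /* fixed up in the shadow table: just rerun L2 */
        bypass_l0 = 1;
        break;
    case VEPT_INJECT_L1:   /* not mapped by L1's EPT tables: L1's problem */
        nest->vmexit_pending = 1;
        break;
    case VEPT_PASS_L0:     /* mapped by L1 but not by the host p2m */
    default:
        /* leave both flags clear: normal L0 EPT-violation handling */
        break;
    }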

> +        }
> +
> +        break;
> +    }
> +
>      case EXIT_REASON_WBINVD:
> -    case EXIT_REASON_EPT_VIOLATION:
>      case EXIT_REASON_EPT_MISCONFIG:
>      case EXIT_REASON_EXTERNAL_INTERRUPT:
>          /* pass to L0 handler */
> @@ -1229,11 +1340,14 @@
>          data = (data << 32) | eax;
>          break;
>      case MSR_IA32_VMX_PROCBASED_CTLS:
> +        mask = paging_mode_hap(current->domain)?
> +                   0: CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> +
>          rdmsr(regs->ecx, eax, edx);
>  #define REMOVED_EXEC_CONTROL_CAP (CPU_BASED_TPR_SHADOW \
> -            | CPU_BASED_ACTIVATE_MSR_BITMAP            \
> -            | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
> +            | CPU_BASED_ACTIVATE_MSR_BITMAP)
>          data = edx & ~REMOVED_EXEC_CONTROL_CAP;
> +        data = edx & ~mask;
>          data = (data << 32) | eax;
>          break;
>      case MSR_IA32_VMX_EXIT_CTLS:
> @@ -1254,12 +1368,20 @@
>          data = (data << 32) | eax;
>          break;
>      case MSR_IA32_VMX_PROCBASED_CTLS2:
> -        mask = 0;
> +        mask = paging_mode_hap(current->domain)?
> +                   SECONDARY_EXEC_ENABLE_EPT : 0;
> 
>          rdmsr(regs->ecx, eax, edx);
>          data = edx & mask;
>          data = (data << 32) | eax;
>          break;
> +    case MSR_IA32_VMX_EPT_VPID_CAP:
> +        rdmsr(regs->ecx, eax, edx);
> +#define REMOVED_EPT_VPID_CAP_HIGH   ( 1 | 1<<8 | 1<<9 | 1<<10 | 1<<11 )
> +#define REMOVED_EPT_VPID_CAP_LOW    ( 1<<16 | 1<<17 | 1<<26 )
> +        data = edx & ~REMOVED_EPT_VPID_CAP_HIGH;
> +        data = (data << 32) | (eax & ~REMOVED_EPT_VPID_CAP_LOW);
> +        break;
> 
>      /* pass through MSRs */
>      case IA32_FEATURE_CONTROL_MSR:
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/vept.c
> --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/arch/x86/hvm/vmx/vept.c       Thu Apr 22 22:30:10 2010 +0800
> @@ -0,0 +1,574 @@
> +/*
> + * vept.c: virtual EPT for nested virtualization
> + *
> + * Copyright (c) 2010, Intel Corporation.
> + * Author: Qing He <qing.he@xxxxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.
> + *
> + */
> +
> +#include <xen/config.h>
> +#include <xen/types.h>
> +#include <xen/list.h>
> +#include <xen/mm.h>
> +#include <xen/paging.h>
> +#include <xen/domain_page.h>
> +#include <xen/sched.h>
> +#include <asm/page.h>
> +#include <xen/numa.h>
> +#include <asm/hvm/vmx/vmx.h>
> +#include <asm/hvm/vmx/vept.h>
> +
> +#undef mfn_to_page
> +#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
> +#undef mfn_valid
> +#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
> +#undef page_to_mfn
> +#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
> +
> +/*
> + * This virtual EPT implementation is independent of the p2m facility
> + * and has some different characteristics. It works in a similar
> + * way to the shadow page table (guest table and host table composition),
> + * but is per-vCPU and vTLB-style:
> + *   - per-vCPU, so no lock is required

What happens when dom0 changes domU's p2m table?  Don't you need to
shoot down existing vEPT tables from a foreign CPU?
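
Even something as crude as a per-vept flush flag would do the job
(untested sketch; the flush_pending field is invented):

    /* On the CPU that changes the host p2m: */
    for_each_vcpu ( d, v )
        if ( v->arch.hvm_vmx.nest.vept )
            v->arch.hvm_vmx.nest.vept->flush_pending = 1;

    /* On the owning vCPU, on its way back into L2: */
    if ( vept->flush_pending )
    {
        vept->flush_pending = 0;
        clear_all_slots(vept);
    }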

> + *   - vTLB-style means all invalidations are honoured, rather than
> + * using write protection. Unlike an ordinary page table, EPT updates
> + * and invalidations are rare in a well-written VMM, so the overhead
> + * is also minimal.
> + *
> + * The physical root is loaded directly to L2 sVMCS, without entering
> + * any other host controls. Multiple `cache slots' are maintained
> + * for multiple guest EPTPs, with simple LRU replacement.
> + *
> + * One limitation so far is that it doesn't work with the
> + * L0 emulation code, so L1 p2m_mmio_direct on top of L0 p2m_mmio_dm
> + * is not supported for now.

Is this something you intend to fix before we check it in?

> + */
> +
> +#define VEPT_MAX_SLOTS 8
> +#define VEPT_ALLOCATION_SIZE 512
> +
> +struct vept_slot {
> +    u64               eptp;   /* guest eptp */
> +    mfn_t             root;   /* root of phys table */
> +    struct list_head  list;
> +
> +    struct page_list_head page_list;
> +};
> +
> +struct vept {
> +    struct list_head   used_slots; /* lru: new->tail, old->head */
> +    struct list_head   free_slots;
> +
> +    int                total_pages;
> +    int                free_pages;
> +    struct page_list_head freelist;
> +
> +    struct vcpu       *vcpu;
> +};
> +
> +
> +static struct vept_slot *__get_eptp_slot(struct vept *vept, u64 geptp)
> +{
> +    struct vept_slot *slot, *tmp;
> +
> +    list_for_each_entry_safe( slot, tmp, &vept->used_slots, list )
> +        if ( slot->eptp == geptp )
> +            return slot;
> +
> +    return NULL;
> +}
> +
> +static struct vept_slot *get_eptp_slot(struct vept *vept, u64 geptp)
> +{
> +    struct vept_slot *slot;
> +
> +    slot = __get_eptp_slot(vept, geptp);
> +    if ( slot != NULL )
> +        list_del(&slot->list);
> +
> +    return slot;
> +}
> +
> +static void __clear_slot(struct vept *vept, struct vept_slot *slot)
> +{
> +    struct page_info *pg;
> +
> +    slot->eptp = 0;
> +
> +    while ( !page_list_empty(&slot->page_list) )
> +    {
> +        pg = page_list_remove_head(&slot->page_list);
> +        page_list_add_tail(pg, &vept->freelist);
> +
> +        vept->free_pages++;
> +    }
> +}
> +
> +static struct vept_slot *get_free_slot(struct vept *vept)
> +{
> +    struct vept_slot *slot = NULL;
> +
> +    if ( !list_empty(&vept->free_slots) )
> +    {
> +        slot = list_entry(vept->free_slots.next, struct vept_slot, list);
> +        list_del(&slot->list);
> +    }
> +    else if ( !list_empty(&vept->used_slots) )
> +    {
> +        slot = list_entry(vept->used_slots.next, struct vept_slot, list);
> +        list_del(&slot->list);
> +        __clear_slot(vept, slot);
> +    }
> +
> +    return slot;
> +}
> +
> +static void clear_all_slots(struct vept *vept)
> +{
> +    struct vept_slot *slot, *tmp;
> +
> +    list_for_each_entry_safe( slot, tmp, &vept->used_slots, list )
> +    {
> +        list_del(&slot->list);
> +        __clear_slot(vept, slot);
> +        list_add_tail(&slot->list, &vept->free_slots);
> +    }
> +}
> +
> +static int free_some_pages(struct vept *vept, struct vept_slot *curr)
> +{
> +    struct vept_slot *slot;
> +    int r = 0;
> +
> +    if ( !list_empty(&vept->used_slots) )
> +    {
> +        slot = list_entry(vept->used_slots.next, struct vept_slot, list);
> +        if ( slot != curr )
> +        {
> +            list_del(&slot->list);
> +            __clear_slot(vept, slot);
> +            list_add_tail(&slot->list, &vept->free_slots);
> +
> +            r = 1;
> +        }
> +    }
> +
> +    return r;
> +}
> +
> +struct vept *vept_init(struct vcpu *v)
> +{
> +    struct vept *vept;
> +    struct vept_slot *slot;
> +    struct page_info *pg;
> +    int i;
> +
> +    vept = xmalloc(struct vept);
> +    if ( vept == NULL )
> +        goto out;
> +
> +    memset(vept, 0, sizeof(*vept));
> +    vept->vcpu = v;
> +
> +    INIT_PAGE_LIST_HEAD(&vept->freelist);
> +    INIT_LIST_HEAD(&vept->used_slots);
> +    INIT_LIST_HEAD(&vept->free_slots);
> +
> +    for ( i = 0; i < VEPT_MAX_SLOTS; i++ )
> +    {
> +        slot = xmalloc(struct vept_slot);
> +        if ( slot == NULL )
> +            break;
> +
> +        memset(slot, 0, sizeof(*slot));
> +
> +        INIT_LIST_HEAD(&slot->list);
> +        INIT_PAGE_LIST_HEAD(&slot->page_list);
> +
> +        list_add(&slot->list, &vept->free_slots);
> +    }
> +
> +    for ( i = 0; i < VEPT_ALLOCATION_SIZE; i++ )

Why a fixed 2MB allocation?  What if your nested domains are very large?
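
I'd expect the pool to scale with the size of the guest instead, e.g.
(the divisor is an arbitrary placeholder):

    /* Scale the shadow-EPT pool with the domain size, not a fixed 2MB. */
    unsigned int nr_pages = max_t(unsigned int, VEPT_ALLOCATION_SIZE,
                                  v->domain->tot_pages / 256);

    for ( i = 0; i < nr_pages; i++ )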

> +    {
> +        pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(v->domain)));

Shouldn't this be allocated from the paging pool like other EPT memory?

> +        if ( pg == NULL )
> +            break;

Return an error?
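
I.e. rather than silently carrying on with a partial (or empty) pool,
something like:

        if ( pg == NULL )
        {
            vept_teardown(vept);  /* frees the slots and pages so far */
            return NULL;          /* and make vmx_nest_handle_vmxon()
                                   * check for this */
        }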

> +        page_list_add_tail(pg, &vept->freelist);
> +        vept->total_pages++;
> +        vept->free_pages++;
> +    }
> +
> + out:
> +    return vept;
> +}
> +
> +void vept_teardown(struct vept *vept)
> +{
> +    struct page_info *pg;
> +    struct vept_slot *slot, *tmp;
> +
> +    clear_all_slots(vept);
> +
> +    while ( !page_list_empty(&vept->freelist) )
> +    {
> +        pg = page_list_remove_head(&vept->freelist);
> +        free_domheap_page(pg);
> +        vept->free_pages++;
> +        vept->total_pages++;
> +    }
> +
> +    list_for_each_entry_safe( slot, tmp, &vept->free_slots, list )
> +        xfree(slot);
> +
> +    xfree(vept);
> +}
> +
> +mfn_t vept_load_eptp(struct vept *vept, u64 geptp)
> +{
> +    struct page_info *pg;
> +    struct vept_slot *slot;
> +    mfn_t mfn = _mfn(INVALID_MFN);
> +    void *addr;
> +
> +    ASSERT(vept->vcpu == current);
> +
> +    slot = get_eptp_slot(vept, geptp);
> +    if ( slot == NULL )
> +    {
> +        slot = get_free_slot(vept);
> +        if ( unlikely(slot == NULL) )
> +        {
> +            gdprintk(XENLOG_ERR, "nest: can't get free slot\n");
> +            return mfn;
> +        }
> +
> +        while ( !vept->free_pages )
> +            if ( !free_some_pages(vept, slot) )
> +            {
> +                slot->eptp = 0;
> +                list_add_tail(&slot->list, &vept->free_slots);
> +                gdprintk(XENLOG_ERR, "nest: vept no free pages\n");
> +
> +                return mfn;
> +            }
> +
> +        vept->free_pages--;
> +        pg = page_list_remove_head(&vept->freelist);
> +
> +        mfn = page_to_mfn(pg);
> +        addr = map_domain_page(mfn_x(mfn));
> +        clear_page(addr);
> +        unmap_domain_page(addr);
> +        page_list_add_tail(pg, &slot->page_list);
> +        slot->eptp = geptp;
> +        slot->root = mfn;
> +    }
> +
> +    mfn = slot->root;
> +    list_add_tail(&slot->list, &vept->used_slots);
> +
> +    return mfn;
> +}
> +
> +mfn_t vept_invalidate(struct vept *vept, u64 geptp)
> +{
> +    struct vept_slot *slot;
> +    mfn_t mfn = _mfn(INVALID_MFN);
> +
> +    ASSERT(vept->vcpu == current);
> +
> +    slot = get_eptp_slot(vept, geptp);
> +    if ( slot != NULL )
> +    {
> +        mfn = slot->root;
> +        __clear_slot(vept, slot);
> +        list_add_tail(&slot->list, &vept->free_slots);
> +    }
> +
> +    return mfn;
> +}
> +
> +void vept_invalidate_all(struct vept *vept)
> +{
> +    ASSERT(vept->vcpu == current);
> +
> +    clear_all_slots(vept);
> +}
> +
> +/*
> + * guest EPT walk and EPT violation
> + */
> +struct ept_walk {
> +    unsigned long gfn;
> +    unsigned long gfn_remainder;
> +    ept_entry_t l4e, l3e, l2e, l1e;
> +    mfn_t l4mfn, l3mfn, l2mfn, l1mfn;
> +    int sp;
> +};
> +typedef struct ept_walk ept_walk_t;
> +
> +#define GEPT_NORMAL_PAGE  0
> +#define GEPT_SUPER_PAGE   1
> +#define GEPT_NOT_PRESENT  2
> +static int guest_ept_next_level(struct vcpu *v, ept_entry_t **table,
> +               unsigned long *gfn_remainder, int level, u32 *ar,
> +               ept_entry_t *entry, mfn_t *next_mfn)
> +{
> +    int index;
> +    ept_entry_t *ept_entry;
> +    ept_entry_t *next;
> +    p2m_type_t p2mt;
> +    int rc = GEPT_NORMAL_PAGE;
> +    mfn_t mfn;
> +
> +    index = *gfn_remainder >> (level * EPT_TABLE_ORDER);
> +
> +    ept_entry = (*table) + index;
> +    *entry = *ept_entry;
> +    *ar &= entry->epte & 0x7;
> +
> +    *gfn_remainder &= (1UL << (level * EPT_TABLE_ORDER)) - 1;
> +
> +    if ( !(ept_entry->epte & 0x7) )
> +        rc = GEPT_NOT_PRESENT;
> +    else if ( ept_entry->sp_avail )
> +        rc = GEPT_SUPER_PAGE;
> +    else
> +    {
> +        mfn = gfn_to_mfn(v->domain, ept_entry->mfn, &p2mt);
> +        if ( !p2m_is_ram(p2mt) )
> +            return GEPT_NOT_PRESENT;
> +
> +        if ( next_mfn )
> +        {
> +            next = map_domain_page(mfn_x(mfn));
> +            unmap_domain_page(*table);
> +
> +            *table = next;
> +            *next_mfn = mfn;
> +        }
> +    }
> +
> +    return rc;
> +}
> +
> +static u32 guest_walk_ept(struct vcpu *v, ept_walk_t *gw,
> +                          u64 geptp, u64 ggpa)
> +{
> +    ept_entry_t *table;
> +    p2m_type_t p2mt;
> +    int rc;
> +    u32 ar = 0x7;
> +
> +    unsigned long gfn = (unsigned long) (ggpa >> PAGE_SHIFT);
> +    unsigned long gfn_remainder = gfn;
> +
> +    memset(gw, 0, sizeof(*gw));
> +    gw->gfn = gfn;
> +    gw->sp = 0;
> +
> +    gw->l4mfn = gfn_to_mfn(v->domain, geptp >> PAGE_SHIFT, &p2mt);
> +    if ( !p2m_is_ram(p2mt) )
> +        return 0;
> +
> +    table = map_domain_page(mfn_x(gw->l4mfn));
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 3, &ar,
> +                              &gw->l4e, &gw->l3mfn);
> +
> +    if ( rc )
> +        goto out;
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 2, &ar,
> +                              &gw->l3e, &gw->l2mfn);
> +
> +    if ( rc == GEPT_SUPER_PAGE )
> +        gw->sp = 2;
> +    if ( rc )
> +        goto out;
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 1, &ar,
> +                              &gw->l2e, &gw->l1mfn);
> +
> +    if ( rc == GEPT_SUPER_PAGE )
> +        gw->sp = 1;
> +    if ( rc )
> +        goto out;
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 0, &ar,
> +                              &gw->l1e, NULL);
> +
> + out:
> +    gw->gfn_remainder = gfn_remainder;
> +    unmap_domain_page(*table);
> +    return ar;
> +}
> +
> +static void epte_set_ar_bits(ept_entry_t *entry, unsigned long ar)
> +{
> +    entry->epte &= ~0x7f;
> +    entry->epte |= ar & 0x7f;
> +}
> +
> +static int shadow_ept_next_level(struct vept *vept, struct vept_slot *slot,
> +                       ept_entry_t **table, unsigned long *gfn_remainder,
> +                       int level, u32 *ar, ept_entry_t gentry)
> +{
> +    int index;
> +    ept_entry_t *sentry;
> +    ept_entry_t *next;
> +    mfn_t mfn;
> +    struct page_info *pg;
> +
> +    index = *gfn_remainder >> (level * EPT_TABLE_ORDER);
> +
> +    sentry = (*table) + index;
> +    *ar = sentry->epte & 0x7;
> +
> +    *gfn_remainder &= (1UL << (level * EPT_TABLE_ORDER)) - 1;
> +
> +    if ( !(sentry->epte & 0x7) )
> +    {
> +        while ( !vept->free_pages )
> +            if ( !free_some_pages(vept, slot) )
> +            {
> +                gdprintk(XENLOG_ERR, "nest: vept no free pages\n");
> +                return 0;
> +            }
> +
> +        vept->free_pages--;
> +        pg = page_list_remove_head(&vept->freelist);
> +        page_list_add_tail(pg, &slot->page_list);
> +        mfn = page_to_mfn(pg);
> +        next = map_domain_page(mfn_x(mfn));
> +        clear_page(next);
> +
> +        sentry->mfn = mfn_x(mfn);
> +    }
> +    else
> +    {
> +        next = map_domain_page(sentry->mfn);
> +    }
> +
> +    epte_set_ar_bits(sentry, gentry.epte);
> +
> +    unmap_domain_page(*table);
> +    *table = next;
> +
> +    return 1;
> +}
> +
> +int vept_ept_violation(struct vept *vept, u64 geptp,
> +                       unsigned long qualification, paddr_t addr)
> +{
> +    ept_walk_t gw;
> +    struct vept_slot *slot;
> +    ept_entry_t *table, *gept;
> +    ept_entry_t *sentry, *gentry;
> +    u32 old_entry, sp_ar = 0;
> +    p2m_type_t p2mt;
> +    unsigned long mfn_start = 0;
> +    unsigned long gfn_remainder;
> +    int rc, i;
> +
> +    ASSERT(vept->vcpu == current);
> +
> +    slot = __get_eptp_slot(vept, geptp);
> +    if ( unlikely(slot == NULL) )
> +        return 0;
> +
> +    rc = guest_walk_ept(vept->vcpu, &gw, geptp, addr);
> +
> +    if ( !(rc & (qualification & 0x7)) )    /* inject to guest */
> +        return 1;
> +
> +    if ( gw.sp == 2 )  /* 1G */
> +    {
> +        sp_ar = gw.l3e.epte & 0x7;
> +        mfn_start = gw.l3e.mfn +
> +                    (gw.gfn_remainder & (~(1 << EPT_TABLE_ORDER) - 1));
> +    }
> +    if ( gw.sp == 1 )  /* 2M */
> +    {
> +        sp_ar = gw.l2e.epte & 0x7;
> +        mfn_start = gw.l2e.mfn;
> +    }
> +    else
> +        mfn_start = 0;
> +
> +    table = map_domain_page(mfn_x(slot->root));
> +    gfn_remainder = gw.gfn;
> +
> +    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 3,
> +                          &old_entry, gw.l4e);

What if shadow_ept_next_level() returns 0 ?

> +    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 2,
> +                          &old_entry, gw.l3e);

Ditto

> +    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 1,
> +                          &old_entry, (gw.sp == 2) ? gw.l3e : gw.l2e);

Ditto
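
All three calls want their return values checked, e.g. (sketch; what to
hand back to the caller on failure needs deciding, since returning 0 here
currently means "retry L2"):

    if ( !shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 3,
                                &old_entry, gw.l4e) ||
         !shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 2,
                                &old_entry, gw.l3e) ||
         !shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 1,
                                &old_entry, (gw.sp == 2) ? gw.l3e : gw.l2e) )
    {
        unmap_domain_page(table);
        return 1;   /* hand the violation to L1? -- needs more thought */
    }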

> +    /* if l1p is just allocated, do a full prefetch */
> +    if ( !old_entry && !gw.sp )
> +    {
> +        gept = map_domain_page(mfn_x(gw.l1mfn));
> +        for ( i = 0; i < 512; i++ )
> +        {
> +            gentry = gept + i;
> +            sentry = table + i;
> +            if ( gentry->epte & 0x7 )
> +            {
> +                sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                        gentry->mfn, &p2mt));
> +                epte_set_ar_bits(sentry, gentry->epte);
> +            }
> +            else
> +                sentry->epte = 0;
> +        }
> +        unmap_domain_page(gept);
> +    }
> +    else if ( !old_entry && gw.sp )
> +    {
> +        for ( i = 0; i < 512; i++ )
> +        {
> +            sentry = table + i;
> +            sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                    mfn_start + i, &p2mt));
> +            epte_set_ar_bits(sentry, sp_ar);
> +        }
> +    }
> +    else if ( old_entry && !gw.sp )
> +    {
> +        i = gw.gfn & ((1 << EPT_TABLE_ORDER) - 1);
> +        sentry = table + i;
> +        sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                gw.l1e.mfn, &p2mt));
> +        epte_set_ar_bits(sentry, gw.l1e.epte);
> +    }
> +    else    // old_entry && gw.sp
> +    {
> +        i = gw.gfn & ((1 << EPT_TABLE_ORDER) - 1);
> +        sentry = table + i;
> +        sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                mfn_start + i, &p2mt));
> +        epte_set_ar_bits(sentry, sp_ar);
> +    }
> +
> +    unmap_domain_page(table);
> +    return 0;
> +}
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/vmx.c
> --- a/xen/arch/x86/hvm/vmx/vmx.c        Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/arch/x86/hvm/vmx/vmx.c        Thu Apr 22 22:30:10 2010 +0800
> @@ -1032,6 +1032,14 @@
>      p2m_type_t p2mt;
>      char *p;
> 
> +    /*
> +     * In nested EPT operation, L0 doesn't know how to interpret CR3;
> +     * it's L1's responsibility to provide GUEST_PDPTRn, and we rely
> +     * solely on those.
> +     */
> +    if ( v->arch.hvm_vcpu.in_nesting && vmx_nest_vept(v) )
> +        return;
> +
>      /* EPT needs to load PDPTRS into VMCS for PAE. */
>      if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
>          return;
> @@ -2705,6 +2713,11 @@
>          if ( vmx_nest_handle_vmxon(regs) == X86EMUL_OKAY )
>              __update_guest_eip(inst_len);
>          break;
> +    case EXIT_REASON_INVEPT:
> +        inst_len = __get_instruction_length();
> +        if ( vmx_nest_handle_invept(regs) == X86EMUL_OKAY )
> +            __update_guest_eip(inst_len);
> +        break;
> 
>      case EXIT_REASON_MWAIT_INSTRUCTION:
>      case EXIT_REASON_MONITOR_INSTRUCTION:
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/include/asm-x86/hvm/vmx/nest.h
> --- a/xen/include/asm-x86/hvm/vmx/nest.h        Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/include/asm-x86/hvm/vmx/nest.h        Thu Apr 22 22:30:10 2010 +0800
> @@ -47,6 +47,9 @@
> 
>      unsigned long        intr_info;
>      unsigned long        error_code;
> +
> +    u64                  geptp;
> +    struct vept         *vept;
>  };
> 
>  asmlinkage void vmx_nest_switch_mode(void);
> @@ -64,6 +67,8 @@
>  int vmx_nest_handle_vmresume(struct cpu_user_regs *regs);
>  int vmx_nest_handle_vmlaunch(struct cpu_user_regs *regs);
> 
> +int vmx_nest_handle_invept(struct cpu_user_regs *regs);
> +
>  void vmx_nest_update_exec_control(struct vcpu *v, unsigned long value);
>  void vmx_nest_update_secondary_exec_control(struct vcpu *v,
>                                              unsigned long value);
> @@ -81,4 +86,6 @@
>  int vmx_nest_msr_write_intercept(struct cpu_user_regs *regs,
>                                   u64 msr_content);
> 
> +int vmx_nest_vept(struct vcpu *v);
> +
>  #endif /* __ASM_X86_HVM_NEST_H__ */
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/include/asm-x86/hvm/vmx/vept.h
> --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-x86/hvm/vmx/vept.h        Thu Apr 22 22:30:10 2010 +0800
> @@ -0,0 +1,10 @@
> +#include <asm/hvm/vmx/vmx.h>
> +
> +
> +struct vept *vept_init(struct vcpu *v);
> +void vept_teardown(struct vept *vept);
> +mfn_t vept_load_eptp(struct vept *vept, u64 eptp);
> +mfn_t vept_invalidate(struct vept *vept, u64 eptp);
> +void vept_invalidate_all(struct vept *vept);
> +int vept_ept_violation(struct vept *vept, u64 eptp,
> +                       unsigned long qualification, paddr_t addr);
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxxxxxxxx
> http://lists.xensource.com/xen-devel

-- 
Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
