diff -r 8ba08f2244b2 xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c	Wed Dec 05 17:56:13 2007 +0000
+++ b/xen/arch/x86/mm/shadow/common.c	Thu Dec 06 09:46:33 2007 +0000
@@ -400,6 +400,10 @@ struct x86_emulate_ops *shadow_init_emul
          !hvm_fetch_from_guest_virt(
              sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf)))
         ? sizeof(sh_ctxt->insn_buf) : 0;
+
+    /* Work out whether we should honour _PAGE_RW */
+    ASSERT(!ring_3(regs));
+    sh_ctxt->force_write_access = !!(is_hvm_vcpu(v) && !hvm_wp_enabled(v));
 
     return &hvm_shadow_emulator_ops;
 }
diff -r 8ba08f2244b2 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c	Wed Dec 05 17:56:13 2007 +0000
+++ b/xen/arch/x86/mm/shadow/multi.c	Thu Dec 06 14:14:33 2007 +0000
@@ -60,12 +60,6 @@
  * the first l1 of a new pagetable.  Should coalesce the flushes to the end,
  * and if we do flush, re-do the walk.  If anything has changed, then
  * pause all the other vcpus and do the walk *again*.
- *
- * WP DISABLED
- * Consider how to implement having the WP bit of CR0 set to 0.
- * Since we need to be able to cause write faults to pagetables, this might
- * end up looking like not having the (guest) pagetables present at all in
- * HVM guests...
  *
  * PSE disabled / PSE36
  * We don't support any modes other than PSE enabled, PSE36 disabled.
@@ -262,7 +256,8 @@ static uint32_t set_ad_bits(void *guest_
  * from any guest PT pages we see, as we will be shadowing them soon
  * and will rely on the contents' not having changed.
  *
- * Returns 0 for success or non-zero if the walk did not complete.
+ * Returns 0 for success, or the set of permission bits that we failed on
+ * if the walk did not complete.
  * N.B. This is different from the old return code but almost no callers
  * checked the old return code anyway.
  */
@@ -2713,13 +2708,14 @@ static int sh_page_fault(struct vcpu *v,
     paddr_t gpa;
     struct sh_emulate_ctxt emul_ctxt;
     struct x86_emulate_ops *emul_ops;
-    int r;
+    int r, force_writes = 0;
     fetch_type_t ft = 0;
     p2m_type_t p2mt;
+    uint32_t error_flags;
 
     SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
                    v->domain->domain_id, v->vcpu_id, va, regs->error_code);
-    
+
     perfc_incr(shadow_fault);
     //
     // XXX: Need to think about eventually mapping superpages directly in the
@@ -2790,11 +2786,25 @@ static int sh_page_fault(struct vcpu *v,
     shadow_lock(d);
     
     shadow_audit_tables(v);
-    
-    if ( guest_walk_tables(v, va, &gw, regs->error_code, 1) != 0 )
-    {
-        perfc_incr(shadow_fault_bail_real_fault);
-        goto not_a_shadow_fault;
+
+    error_flags = guest_walk_tables(v, va, &gw, regs->error_code, 1);
+
+    if ( error_flags != 0 )
+    {
+        if ( is_hvm_domain(d)
+             && unlikely(!hvm_wp_enabled(v))
+             && !ring_3(regs)
+             && error_flags == _PAGE_RW ) /* The *only* reason we faulted */
+            /* In HVM guests, we force CR0.WP always to be set, so that
+             * the pagetables are always write-protected.  If the guest
+             * thinks CR0.WP is clear, we must emulate faulting supervisor
+             * writes to allow the guest to write through read-only PTEs. */
+            force_writes = 1;
+        else
+        {
+            perfc_incr(shadow_fault_bail_real_fault);
+            goto not_a_shadow_fault;
+        }
     }
 
     /* It's possible that the guest has put pagetables in memory that it has
@@ -2882,6 +2892,10 @@ static int sh_page_fault(struct vcpu *v,
         gpa = guest_walk_to_gpa(&gw);
         goto mmio;
     }
+
+    /* Emulate writes for faults caused by our forcing CR0.WP on */
+    if ( force_writes )
+        goto emulate;
 
     perfc_incr(shadow_fault_fixed);
     d->arch.paging.log_dirty.fault_count++;
@@ -3968,110 +3982,205 @@ int sh_remove_l3_shadow(struct vcpu *v, 
 
 /**************************************************************************/
 /* Handling HVM guest writes to pagetables */
 
-/* Check that the user is allowed to perform this write.
- * Returns a mapped pointer to write to, and the mfn it's on,
- * or NULL for error. */
-static inline void * emulate_map_dest(struct vcpu *v,
-                                      unsigned long vaddr,
-                                      struct sh_emulate_ctxt *sh_ctxt,
-                                      mfn_t *mfnp)
-{
-    uint32_t pfec;
+/* Translate a VA to an MFN, injecting a page-fault if we fail */
+static mfn_t emulate_gva_to_mfn(struct vcpu *v,
+                                unsigned long vaddr,
+                                struct sh_emulate_ctxt *sh_ctxt)
+{
     unsigned long gfn;
     mfn_t mfn;
     p2m_type_t p2mt;
-
-    /* We don't emulate user-mode writes to page tables */
-    if ( ring_3(sh_ctxt->ctxt.regs) )
-        return NULL;
-
-    /* Translate the VA, and exit with a page-fault if we fail */
-    pfec = PFEC_page_present | PFEC_write_access;
+    uint32_t pfec = PFEC_page_present;
+
+    /* Translate the VA to a GFN */
+    if ( likely(!sh_ctxt->force_write_access) )
+        pfec |= PFEC_write_access;
     gfn = sh_gva_to_gfn(v, vaddr, &pfec);
     if ( gfn == INVALID_GFN )
     {
+        pfec |= PFEC_write_access;
         if ( is_hvm_vcpu(v) )
             hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
         else
             propagate_page_fault(vaddr, pfec);
-        return NULL;
-    }
-
-    /* Translate the GFN */
+        return _mfn(INVALID_MFN);
+    }
+
+    /* Translate the GFN to an MFN */
     mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
     if ( p2m_is_ram(p2mt) )
     {
         ASSERT(mfn_valid(mfn));
-        *mfnp = mfn;
         v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
-        return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
+        return mfn;
+    }
+
+    return _mfn(INVALID_MFN);
+}
+
+/* Check that the user is allowed to perform this write.
+ * Returns a mapped pointer to write to, or NULL for error. */
+static void * emulate_map_dest(struct vcpu *v,
+                               unsigned long vaddr,
+                               u32 bytes,
+                               struct sh_emulate_ctxt *sh_ctxt)
+{
+    unsigned long offset;
+    void *map = NULL;
+
+    /* We don't emulate user-mode writes to page tables */
+    if ( ring_3(sh_ctxt->ctxt.regs) )
+        return NULL;
+
+    sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
+    if ( !mfn_valid(sh_ctxt->mfn1) )
+        return NULL;
+
+    /* Unaligned writes mean probably this isn't a pagetable */
+    if ( vaddr & (bytes - 1) )
+        sh_remove_shadows(v, sh_ctxt->mfn1, 1, 0 /* Fast, can fail */ );
+
+    if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
+    {
+        /* Whole write fits on a single page */
+        sh_ctxt->mfn2 = _mfn(INVALID_MFN);
+        map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
     }
     else
-        return NULL;
-}
-
-static int safe_not_to_verify_write(mfn_t gmfn, void *dst, void *src,
-                                    int bytes)
-{
+    {
+        /* Cross-page emulated writes are only supported for HVM guests;
+         * PV guests ought to know better */
+        if ( !is_hvm_vcpu(v) )
+            return NULL;
+
+        /* This write crosses a page boundary.  Translate the second page */
+        sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
+                                           sh_ctxt);
+        if ( !mfn_valid(sh_ctxt->mfn2) )
+            return NULL;
+
+        /* Cross-page writes mean probably not a pagetable */
+        sh_remove_shadows(v, sh_ctxt->mfn2, 1, 0 /* Fast, can fail */ );
+
+        /* Hack: we map the pages into the vcpu's LDT space, since we
+         * know that we're not going to need the LDT for HVM guests,
+         * and only HVM guests are allowed unaligned writes. */
+        ASSERT(is_hvm_vcpu(v));
+        map = (void *)LDT_VIRT_START(v);
+        offset = l1_linear_offset((unsigned long) map);
+        l1e_write(&__linear_l1_table[offset],
+                  l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
+        l1e_write(&__linear_l1_table[offset + 1],
+                  l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
+        flush_tlb_local();
+        map += (vaddr & ~PAGE_MASK);
+    }
+
 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
-    struct page_info *pg = mfn_to_page(gmfn);
-    if ( !(pg->shadow_flags & SHF_32)
-         && ((unsigned long)dst & 7) == 0 )
-    {
-        /* Not shadowed 32-bit: aligned 64-bit writes that leave the
-         * present bit unset are safe to ignore. */
-        if ( (*(u64*)src & _PAGE_PRESENT) == 0
-             && (*(u64*)dst & _PAGE_PRESENT) == 0 )
-            return 1;
-    }
-    else if ( !(pg->shadow_flags & (SHF_PAE|SHF_64))
-              && ((unsigned long)dst & 3) == 0 )
-    {
-        /* Not shadowed PAE/64-bit: aligned 32-bit writes that leave the
-         * present bit unset are safe to ignore. */
-        if ( (*(u32*)src & _PAGE_PRESENT) == 0
-             && (*(u32*)dst & _PAGE_PRESENT) == 0 )
-            return 1;
-    }
-#endif
-    return 0;
-}
-
+    /* Remember if the bottom bit was clear, so we can choose not to run
+     * the change through the verify code if it's still clear afterwards */
+    sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
+#endif
+
+    return map;
+}
+
+/* Tidy up after the emulated write: mark pages dirty, verify the new
+ * contents, and undo the mapping */
+static void emulate_unmap_dest(struct vcpu *v,
+                               void *addr,
+                               u32 bytes,
+                               struct sh_emulate_ctxt *sh_ctxt)
+{
+    u32 b1 = bytes, b2 = 0, shflags;
+
+    ASSERT(mfn_valid(sh_ctxt->mfn1));
+
+    /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
+    if ( likely(bytes >= 4)
+         && (*(u32 *)addr == 0)
+         && ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
+        check_for_early_unshadow(v, sh_ctxt->mfn1);
+    else
+        reset_early_unshadow(v);
+
+    /* We can avoid re-verifying the page contents after the write if:
+     *  - it was no larger than the PTE type of this pagetable;
+     *  - it was aligned to the PTE boundaries; and
+     *  - _PAGE_PRESENT was clear before and after the write. */
+    shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
+#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
+    if ( sh_ctxt->low_bit_was_clear
+         && !(*(u8 *)addr & _PAGE_PRESENT)
+         && ((!(shflags & SHF_32)
+              /* Not shadowed 32-bit: aligned 64-bit writes that leave
+               * the present bit unset are safe to ignore. */
+              && ((unsigned long)addr & 7) == 0
+              && bytes <= 8)
+             ||
+             (!(shflags & (SHF_PAE|SHF_64))
+              /* Not shadowed PAE/64-bit: aligned 32-bit writes that
+               * leave the present bit unset are safe to ignore. */
+              && ((unsigned long)addr & 3) == 0
+              && bytes <= 4)) )
+    {
+        /* Writes with this alignment constraint can't possibly cross pages */
+        ASSERT(!mfn_valid(sh_ctxt->mfn2));
+    }
+    else
+#endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
+    {
+        if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
+        {
+            /* Validate as two writes, one to each page */
+            b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
+            b2 = bytes - b1;
+            ASSERT(b2 < bytes);
+        }
+        if ( likely(b1 > 0) )
+            sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
+        if ( unlikely(b2 > 0) )
+            sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
+    }
+
+    paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
+
+    if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
+    {
+        unsigned long offset;
+        paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
+        /* Undo the hacky two-frame contiguous map. */
+        ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
+        offset = l1_linear_offset((unsigned long) addr & PAGE_MASK);
+        l1e_write(&__linear_l1_table[offset], l1e_empty());
+        l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
+        flush_tlb_all();
+    }
+    else
+        sh_unmap_domain_page(addr);
+}
 
 int
 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
                      u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
 {
-    mfn_t mfn;
     void *addr;
-    int skip;
-
-    if ( vaddr & (bytes-1) )
+
+    /* Unaligned writes are only acceptable on HVM */
+    if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
         return X86EMUL_UNHANDLEABLE;
-    ASSERT(((vaddr & ~PAGE_MASK) + bytes) <= PAGE_SIZE);
+
     shadow_lock(v->domain);
-
-    addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn);
+    addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
     if ( addr == NULL )
     {
         shadow_unlock(v->domain);
         return X86EMUL_EXCEPTION;
     }
 
-    skip = safe_not_to_verify_write(mfn, addr, src, bytes);
     memcpy(addr, src, bytes);
-    if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
-
-    /* If we are writing zeros to this page, might want to unshadow */
-    if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
-        check_for_early_unshadow(v, mfn);
-    else
-        reset_early_unshadow(v);
-
-    paging_mark_dirty(v->domain, mfn_x(mfn));
-
-    sh_unmap_domain_page(addr);
+
+    emulate_unmap_dest(v, addr, bytes, sh_ctxt);
     shadow_audit_tables(v);
     shadow_unlock(v->domain);
     return X86EMUL_OKAY;
@@ -4082,25 +4191,22 @@ sh_x86_emulate_cmpxchg(struct vcpu *v, u
                        unsigned long old, unsigned long new,
                        unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
 {
-    mfn_t mfn;
     void *addr;
     unsigned long prev;
-    int rv = X86EMUL_OKAY, skip;
-
-    ASSERT(bytes <= sizeof(unsigned long));
+    int rv = X86EMUL_OKAY;
+
+    /* Unaligned writes are only acceptable on HVM */
+    if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
+        return X86EMUL_UNHANDLEABLE;
+
     shadow_lock(v->domain);
 
-    if ( vaddr & (bytes-1) )
-        return X86EMUL_UNHANDLEABLE;
-
-    addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn);
+    addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
     if ( addr == NULL )
     {
         shadow_unlock(v->domain);
         return X86EMUL_EXCEPTION;
     }
-
-    skip = safe_not_to_verify_write(mfn, &new, &old, bytes);
 
     switch ( bytes )
     {
@@ -4113,26 +4219,14 @@ sh_x86_emulate_cmpxchg(struct vcpu *v, u
         prev = ~old;
     }
 
-    if ( prev == old )
-    {
-        if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
-    }
-    else
+    if ( prev != old )
         rv = X86EMUL_CMPXCHG_FAILED;
 
     SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
                   " wanted %#lx now %#lx bytes %u\n",
                   vaddr, prev, old, new, *(unsigned long *)addr, bytes);
 
-    /* If we are writing zeros to this page, might want to unshadow */
-    if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
-        check_for_early_unshadow(v, mfn);
-    else
-        reset_early_unshadow(v);
-
-    paging_mark_dirty(v->domain, mfn_x(mfn));
-
-    sh_unmap_domain_page(addr);
+    emulate_unmap_dest(v, addr, bytes, sh_ctxt);
     shadow_audit_tables(v);
     shadow_unlock(v->domain);
     return rv;
@@ -4144,17 +4238,17 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v,
                          unsigned long new_lo, unsigned long new_hi,
                          struct sh_emulate_ctxt *sh_ctxt)
 {
-    mfn_t mfn;
     void *addr;
     u64 old, new, prev;
-    int rv = X86EMUL_OKAY, skip;
-
-    if ( vaddr & 7 )
+    int rv = X86EMUL_OKAY;
+
+    /* Unaligned writes are only acceptable on HVM */
+    if ( (vaddr & 7) && !is_hvm_vcpu(v) )
         return X86EMUL_UNHANDLEABLE;
 
     shadow_lock(v->domain);
 
-    addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn);
+    addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
     if ( addr == NULL )
     {
         shadow_unlock(v->domain);
@@ -4163,25 +4257,12 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v,
 
     old = (((u64) old_hi) << 32) | (u64) old_lo;
     new = (((u64) new_hi) << 32) | (u64) new_lo;
-    skip = safe_not_to_verify_write(mfn, &new, &old, 8);
     prev = cmpxchg(((u64 *)addr), old, new);
 
-    if ( prev == old )
-    {
-        if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, 8);
-    }
-    else
+    if ( prev != old )
        rv = X86EMUL_CMPXCHG_FAILED;
 
-    /* If we are writing zeros to this page, might want to unshadow */
-    if ( *(u32 *)addr == 0 )
-        check_for_early_unshadow(v, mfn);
-    else
-        reset_early_unshadow(v);
-
-    paging_mark_dirty(v->domain, mfn_x(mfn));
-
-    sh_unmap_domain_page(addr);
+    emulate_unmap_dest(v, addr, 8, sh_ctxt);
     shadow_audit_tables(v);
     shadow_unlock(v->domain);
     return rv;
diff -r 8ba08f2244b2 xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h	Wed Dec 05 17:56:13 2007 +0000
+++ b/xen/arch/x86/mm/shadow/private.h	Thu Dec 06 09:46:33 2007 +0000
@@ -429,13 +429,6 @@ int shadow_cmpxchg_guest_entry(struct vc
 
 #undef pagetable_from_page
 #define pagetable_from_page(pg) pagetable_from_mfn(page_to_mfn(pg))
-
-#if GUEST_PAGING_LEVELS >= 3
-# define is_lo_pte(_vaddr) (((_vaddr)&0x4)==0)
-#else
-# define is_lo_pte(_vaddr) (1)
-#endif
-
 static inline int
 sh_mfn_is_a_page_table(mfn_t gmfn)
 {
@@ -664,14 +657,26 @@ struct sh_emulate_ctxt {
 struct sh_emulate_ctxt {
     struct x86_emulate_ctxt ctxt;
 
-    /* [HVM] Cache of up to 31 bytes of instruction. */
+    /* Cache of up to 31 bytes of instruction. */
     uint8_t insn_buf[31];
     uint8_t insn_buf_bytes;
     unsigned long insn_buf_eip;
 
-    /* [HVM] Cache of segment registers already gathered for this emulation. */
+    /* Cache of segment registers already gathered for this emulation. */
     unsigned int valid_seg_regs;
     struct segment_register seg_reg[6];
+
+    /* MFNs being written to in write/cmpxchg callbacks */
+    mfn_t mfn1, mfn2;
+
+    /* Special case for supervisor-mode writes with CR0.WP clear */
+    int force_write_access:1;
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
+    /* Special case for avoiding having to verify writes: remember
+     * whether the old value had its low bit (_PAGE_PRESENT) clear. */
+    int low_bit_was_clear:1;
+#endif
 };
 
 struct x86_emulate_ops *shadow_init_emulation(
diff -r 8ba08f2244b2 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h	Wed Dec 05 17:56:13 2007 +0000
+++ b/xen/include/asm-x86/hvm/hvm.h	Thu Dec 06 09:46:33 2007 +0000
@@ -152,6 +152,8 @@ u64 hvm_get_guest_tsc(struct vcpu *v);
 
 #define hvm_paging_enabled(v) \
     (!!((v)->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG))
+#define hvm_wp_enabled(v) \
+    (!!((v)->arch.hvm_vcpu.guest_cr[0] & X86_CR0_WP))
 #define hvm_pae_enabled(v) \
     (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE))
 #define hvm_nx_enabled(v) \
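
Note on the sh_page_fault() change above: the guard that decides when a fault should be handled by forcing write emulation can be read as a standalone predicate. The sketch below is illustrative only and is not part of the patch; the helper name, the simplified PAGE_RW constant and the bool/CPL parameters are assumptions, and in the patch the same test is written inline against is_hvm_domain(), hvm_wp_enabled(), ring_3() and the guest_walk_tables() return value.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_RW 0x2u   /* stand-in for the _PAGE_RW permission bit */

static bool must_force_write_emulation(bool is_hvm, bool guest_cr0_wp,
                                       unsigned int cpl, uint32_t walk_errors)
{
    /* Only HVM guests can believe CR0.WP is clear while the shadow code
     * keeps the real CR0.WP set; PV guests never take this path. */
    if ( !is_hvm || guest_cr0_wp )
        return false;

    /* CR0.WP=0 only lets supervisor code write through read-only PTEs;
     * a ring-3 write must still fault back to the guest. */
    if ( cpl == 3 )
        return false;

    /* The walk must have failed *only* for lack of write permission;
     * any other missing bit is a genuine guest fault. */
    return walk_errors == PAGE_RW;
}

int main(void)
{
    /* Supervisor write through a read-only PTE with guest CR0.WP clear. */
    printf("%d\n", must_force_write_emulation(true, false, 0, PAGE_RW)); /* 1 */
    /* The same write from ring 3 must be reflected back to the guest. */
    printf("%d\n", must_force_write_emulation(true, false, 3, PAGE_RW)); /* 0 */
    return 0;
}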
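
The new emulate_map_dest()/emulate_unmap_dest() pair lets an emulated write straddle a page boundary, validating the two halves separately against mfn1 and mfn2. Below is a minimal arithmetic sketch of that split, assuming 4K pages; the main() driver and the example address are hypothetical, and only the b1/b2 computation mirrors the patch.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long vaddr = 0xffd00ffcUL; /* last 4 bytes of one page */
    unsigned long bytes = 8;            /* an 8-byte PTE write */

    /* Same split as emulate_unmap_dest(): b1 bytes land on the first
     * frame, the remaining b2 bytes on the second. */
    unsigned long b1 = PAGE_SIZE - (vaddr & ~PAGE_MASK);
    unsigned long b2 = bytes - b1;

    printf("validate %lu byte(s) against mfn1, %lu against mfn2\n", b1, b2);
    return 0;
}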
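
The SHOPT_SKIP_VERIFY test in emulate_unmap_dest() can also be viewed in isolation: re-verification is skipped only for PTE-aligned writes no larger than the PTE size of the shadow type, where _PAGE_PRESENT was clear both before and after the write. This is a sketch under those assumptions; the SHF_* values and the function name are placeholders, and the real code additionally asserts that such a write cannot have crossed a page (mfn2 is invalid).

#include <stdbool.h>
#include <stdint.h>

/* Placeholder shadow_flags bits; the real SHF_* definitions live in
 * xen/arch/x86/mm/shadow/private.h. */
#define SHF_32  (1u << 0)
#define SHF_PAE (1u << 1)
#define SHF_64  (1u << 2)

static bool can_skip_verify(uint32_t shflags, uintptr_t addr,
                            unsigned int bytes,
                            bool present_before, bool present_after)
{
    /* _PAGE_PRESENT must have been clear before and after the write. */
    if ( present_before || present_after )
        return false;

    /* Not shadowed 32-bit: aligned writes no larger than a 64-bit PTE. */
    if ( !(shflags & SHF_32) && (addr & 7) == 0 && bytes <= 8 )
        return true;

    /* Not shadowed PAE/64-bit: aligned writes no larger than a 32-bit PTE. */
    if ( !(shflags & (SHF_PAE | SHF_64)) && (addr & 3) == 0 && bytes <= 4 )
        return true;

    return false;
}

int main(void)
{
    /* Aligned 8-byte write, not-present before and after, to a page that
     * is not shadowed as 32-bit: safe to skip re-verification. */
    return can_skip_verify(SHF_64, 0x1000, 8, false, false) ? 0 : 1;
}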