This patch implements the basic mechanisms for letting shadowed guest
pagetables go out of sync with their shadows, and for bringing them back
into sync again.
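To make the second-chance hashing easier to follow, here is a minimal,
self-contained model of the oos_hash_add() logic from the patch (a sketch
only: the flat array, the resync() hook and main() are simplified
stand-ins for the real per-vcpu state and _sh_resync()):

#include <stdio.h>

#define SHADOW_OOS_PAGES 7          /* prime, as in the patch */
#define INVALID_MFN (~0UL)

static unsigned long oos[SHADOW_OOS_PAGES];

/* Stand-in for _sh_resync(): just report the eviction */
static void resync(unsigned long gmfn) { printf("resync %lu\n", gmfn); }

static void oos_hash_add(unsigned long gmfn)
{
    int idx = gmfn % SHADOW_OOS_PAGES;
    if ( oos[idx] != INVALID_MFN
         && (oos[idx] % SHADOW_OOS_PAGES) == (unsigned long)idx )
    {
        /* Home slot held by its rightful owner: take it over and
         * punt the old occupant into the next slot */
        unsigned long prev = oos[idx];
        oos[idx] = gmfn;
        gmfn = prev;
        idx = (idx + 1) % SHADOW_OOS_PAGES;
    }
    if ( oos[idx] != INVALID_MFN )
        resync(oos[idx]);           /* crush the current occupant */
    oos[idx] = gmfn;
}

int main(void)
{
    int i;
    for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
        oos[i] = INVALID_MFN;
    oos_hash_add(7);   /* home slot 0 */
    oos_hash_add(14);  /* also hashes to 0: 7 is punted to slot 1 */
    oos_hash_add(8);   /* slot 1 full, and 7 doesn't own it: 7 resynced */
    for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
        if ( oos[i] != INVALID_MFN )
            printf("slot %d: mfn %lu\n", i, oos[i]);
    return 0;
}

Each gmfn has one home slot (gmfn % SHADOW_OOS_PAGES) plus the slot after
it; an entry sitting in its own home slot is punted to the overflow slot
to make room, and whatever occupied the overflow slot is resynced and
evicted.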
Signed-off-by: Gianluca Guida <gianluca.guida@xxxxxxxxxxxxx>
Signed-off-by: Tim Deegan <tim.deegan@xxxxxxxxxxxxx>
Signed-off-by: George Dunlap <george.dunlap@xxxxxxxxxxxxx>
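Also for reference: the sh_page_has_multiple_shadows() helper added to
private.h detects "more than one shadow-type bit set" by clearing the
lowest set bit and testing what remains. A standalone sketch of just that
bit trick, with find_first_set_bit() modelled by GCC's __builtin_ctz()
(an assumption for the sketch, not the hypervisor's helper):

#include <stdio.h>

/* More than one shadow type? Clear the lowest set bit and see
 * whether anything is left over. */
static int has_multiple_shadow_types(unsigned int shadows)
{
    if ( shadows == 0 )
        return 0;   /* __builtin_ctz(0) is undefined: guard it */
    return (shadows & ~(1u << __builtin_ctz(shadows))) != 0;
}

int main(void)
{
    printf("%d\n", has_multiple_shadow_types(0x0)); /* 0: no shadows */
    printf("%d\n", has_multiple_shadow_types(0x8)); /* 0: one type */
    printf("%d\n", has_multiple_shadow_types(0xA)); /* 1: two types */
    return 0;
}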
diff -r 26ecd1f9e128 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm.c Fri Jun 20 15:10:08 2008 +0100
@@ -1933,9 +1933,15 @@ int get_page_type(struct page_info *page
{
struct domain *d = page_get_owner(page);
- /* Never allow a shadowed frame to go from type count 0 to 1 */
- if ( d && shadow_mode_enabled(d) )
- shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
+ /* Normally we should never let a page go from type count 0
+ * to type count 1 when it is shadowed. One exception:
+ * out-of-sync shadowed pages are allowed to become
+ * writeable. */
+ if ( d && shadow_mode_enabled(d)
+ && (page->count_info & PGC_page_table)
+ && !((page->shadow_flags & (1u<<29)) /* SHF_oos_may_write */
+ && type == PGT_writable_page) )
+ shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
ASSERT(!(x & PGT_pae_xen_l2));
if ( (x & PGT_type_mask) != type )
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/common.c Fri Jun 20 15:10:08 2008 +0100
@@ -54,6 +54,10 @@ void shadow_domain_init(struct domain *d
/* Use shadow pagetables for log-dirty support */
paging_log_dirty_init(d, shadow_enable_log_dirty,
shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ d->arch.paging.shadow.oos_active = 0;
+#endif
}
/* Setup the shadow-specific parts of a vcpu struct. Note: The most important
@@ -64,6 +68,13 @@ void shadow_domain_init(struct domain *d
*/
void shadow_vcpu_init(struct vcpu *v)
{
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ int i;
+
+ for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
+ v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
+#endif
+
v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
}
@@ -427,6 +438,404 @@ void shadow_continue_emulation(struct sh
}
}
}
+
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Out-of-sync shadows. */
+
+/* From time to time, we let a shadowed pagetable page go out of sync
+ * with its shadow: the guest is allowed to write directly to the page,
+ * and those writes are not synchronously reflected in the shadow.
+ * This lets us avoid many emulations if the guest is writing a lot to a
+ * pagetable, but it relaxes a pretty important invariant in the shadow
+ * pagetable design. Therefore, some rules:
+ *
+ * 1. Only L1 pagetables may go out of sync: any page that is shadowed
+ * at a higher level must be synchronously updated. This makes
+ * using linear shadow pagetables much less dangerous.
+ * That means that: (a) unsyncing code needs to check for higher-level
+ * shadows, and (b) promotion code needs to resync.
+ *
+ * 2. All shadow operations on a guest page require the page to be brought
+ * back into sync before proceeding. This must be done under the
+ * shadow lock so that the page is guaranteed to remain synced until
+ * the operation completes.
+ *
+ * Exceptions to this rule: the pagefault and invlpg handlers may
+ * update only one entry on an out-of-sync page without resyncing it.
+ *
+ * 3. Operations on shadows that do not start from a guest page need to
+ * be aware that they may be handling an out-of-sync shadow.
+ *
+ * 4. Operations that do not normally take the shadow lock (fast-path
+ * #PF handler, INVLPG) must fall back to a locking, syncing version
+ * if they see an out-of-sync table.
+ *
+ * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
+ * must explicitly resync all relevant pages or update their
+ * shadows.
+ *
+ * Currently out-of-sync pages are listed in a simple open-addressed
+ * hash table with a second chance (must resist temptation to radically
+ * over-engineer hash tables...). The virtual address of the access
+ * which caused us to unsync the page is also kept in the hash table, as
+ * a hint for finding the writable mappings later.
+ *
+ * We keep a hash per vcpu, because we want as much as possible to do
+ * the resync on the same vcpu we did the unsync on, so the VA hint
+ * will be valid.
+ */
+
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
+static void sh_oos_audit(struct domain *d)
+{
+ int idx, expected_idx, expected_idx_alt;
+ struct page_info *pg;
+ struct vcpu *v;
+
+ for_each_vcpu(d, v)
+ {
+ for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+ {
+ mfn_t *oos = v->arch.paging.shadow.oos;
+ if ( !mfn_valid(oos[idx]) )
+ continue;
+
+ expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
+ expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
+ if ( idx != expected_idx && idx != expected_idx_alt )
+ {
+ printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
+ __func__, idx, mfn_x(oos[idx]),
+ expected_idx, expected_idx_alt);
+ BUG();
+ }
+ pg = mfn_to_page(oos[idx]);
+ if ( !(pg->count_info & PGC_page_table) )
+ {
+ printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
+ __func__, idx, mfn_x(oos[idx]), pg->count_info);
+ BUG();
+ }
+ if ( !(pg->shadow_flags & SHF_out_of_sync) )
+ {
+ printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
+ __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+ BUG();
+ }
+ if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
+ {
+ printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
+ __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+ BUG();
+ }
+ }
+ }
+}
+#endif
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn)
+{
+ int idx;
+ struct vcpu *v;
+ mfn_t *oos;
+
+ ASSERT(mfn_is_out_of_sync(gmfn));
+
+ for_each_vcpu(d, v)
+ {
+ oos = v->arch.paging.shadow.oos;
+ idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+ if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+ idx = (idx + 1) % SHADOW_OOS_PAGES;
+
+ if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+ return;
+ }
+
+ SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
+ BUG();
+}
+#endif
+
+/* Update the shadow, but keep the page out of sync. */
+static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn)
+{
+ struct page_info *pg = mfn_to_page(gmfn);
+
+ ASSERT(mfn_valid(gmfn));
+ ASSERT(page_is_out_of_sync(pg));
+
+ /* Call out to the appropriate per-mode resyncing function */
+ if ( pg->shadow_flags & SHF_L1_32 )
+ SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn);
+ else if ( pg->shadow_flags & SHF_L1_PAE )
+ SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn);
+#if CONFIG_PAGING_LEVELS >= 4
+ else if ( pg->shadow_flags & SHF_L1_64 )
+ SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn);
+#endif
+}
+
+/* Pull all the entries on an out-of-sync page back into sync. */
+static void _sh_resync(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+ struct page_info *pg = mfn_to_page(gmfn);
+
+ ASSERT(shadow_locked_by_me(v->domain));
+ ASSERT(mfn_is_out_of_sync(gmfn));
+ /* Guest page must be shadowed *only* as L1 when out of sync. */
+ ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask
+ & ~SHF_L1_ANY));
+ ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
+
+ SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n",
+ v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+ /* Need to pull write access so the page *stays* in sync.
+ * This might be rather slow but we hope that in the common case
+ * we're handling this pagetable after a guest walk has pulled
+ * write access the fast way. */
+ switch ( sh_remove_write_access(v, gmfn, 0, va) )
+ {
+ default:
+ case 0:
+ break;
+
+ case 1:
+ flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ break;
+
+ case -1:
+ /* An unfindable writeable typecount has appeared, probably via a
+ * grant table entry: can't shoot the mapping, so try to unshadow
+ * the page. If that doesn't work either, the guest is granting
+ * its pagetables and must be killed after all. */
+ sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
+ return;
+ }
+
+ /* No more writable mappings of this page, please */
+ pg->shadow_flags &= ~SHF_oos_may_write;
+
+ /* Update the shadows with current guest entries. */
+ _sh_resync_l1(v, gmfn);
+
+ /* Now we know all the entries are synced, and will stay that way */
+ pg->shadow_flags &= ~SHF_out_of_sync;
+ perfc_incr(shadow_resync);
+}
+
+
+/* Add an MFN to the list of out-of-sync guest pagetables */
+static void oos_hash_add(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+ int idx;
+ mfn_t *oos = v->arch.paging.shadow.oos;
+ unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+
+ idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+ if ( mfn_valid(oos[idx])
+ && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
+ {
+ /* Punt the current occupant into the next slot */
+ SWAP(oos[idx], gmfn);
+ SWAP(oos_va[idx], va);
+ idx = (idx + 1) % SHADOW_OOS_PAGES;
+ }
+ if ( mfn_valid(oos[idx]) )
+ {
+ /* Crush the current occupant. */
+ _sh_resync(v, oos[idx], oos_va[idx]);
+ perfc_incr(shadow_unsync_evict);
+ }
+ oos[idx] = gmfn;
+ oos_va[idx] = va;
+}
+
+/* Remove an MFN from the list of out-of-sync guest pagetables */
+static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
+{
+ int idx;
+ mfn_t *oos;
+ struct domain *d = v->domain;
+
+ SHADOW_PRINTK("D%dV%d gmfn %lx\n",
+ v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
+
+ for_each_vcpu(d, v)
+ {
+ oos = v->arch.paging.shadow.oos;
+ idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+ if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+ idx = (idx + 1) % SHADOW_OOS_PAGES;
+ if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+ {
+ oos[idx] = _mfn(INVALID_MFN);
+ return;
+ }
+ }
+
+ SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+ BUG();
+}
+
+/* Pull a single guest page back into sync */
+void sh_resync(struct vcpu *v, mfn_t gmfn)
+{
+ int idx;
+ mfn_t *oos;
+ unsigned long *oos_va;
+ struct domain *d = v->domain;
+
+ for_each_vcpu(d, v)
+ {
+ oos = v->arch.paging.shadow.oos;
+ oos_va = v->arch.paging.shadow.oos_va;
+ idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+ if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+ idx = (idx + 1) % SHADOW_OOS_PAGES;
+
+ if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+ {
+ _sh_resync(v, gmfn, oos_va[idx]);
+ oos[idx] = _mfn(INVALID_MFN);
+ return;
+ }
+ }
+
+ SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+ BUG();
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table,
+ * by making a call out to the mode in which that shadow was made. */
+static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+ struct page_info *pg = mfn_to_page(gl1mfn);
+ if ( pg->shadow_flags & SHF_L1_32 )
+ return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
+ else if ( pg->shadow_flags & SHF_L1_PAE )
+ return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
+#if CONFIG_PAGING_LEVELS >= 4
+ else if ( pg->shadow_flags & SHF_L1_64 )
+ return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
+#endif
+ SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n",
+ mfn_x(gl1mfn));
+ BUG();
+ return 0; /* BUG() is no longer __attribute__((noreturn)). */
+}
+
+
+/* Pull all out-of-sync pages back into sync. Pages brought out of sync
+ * on other vcpus are allowed to remain out of sync, but their contents
+ * will be made safe (TLB flush semantics); pages unsynced by this vcpu
+ * are brought back into sync and write-protected. If skip != 0, we try
+ * to avoid resyncing at all if we think we can get away with it. */
+void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
+{
+ int idx;
+ struct vcpu *other;
+ mfn_t *oos = v->arch.paging.shadow.oos;
+ unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+
+ SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
+
+ ASSERT(do_locking || shadow_locked_by_me(v->domain));
+
+ if ( !this )
+ goto resync_others;
+
+ if ( do_locking )
+ shadow_lock(v->domain);
+
+ /* First: resync all of this vcpu's oos pages */
+ for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+ if ( mfn_valid(oos[idx]) )
+ {
+ /* Write-protect and sync contents */
+ _sh_resync(v, oos[idx], oos_va[idx]);
+ oos[idx] = _mfn(INVALID_MFN);
+ }
+
+ if ( do_locking )
+ shadow_unlock(v->domain);
+
+ resync_others:
+ if ( !others )
+ return;
+
+ /* Second: make all *other* vcpus' oos pages safe. */
+ for_each_vcpu(v->domain, other)
+ {
+ if ( v == other )
+ continue;
+
+ if ( do_locking )
+ shadow_lock(v->domain);
+
+ oos = other->arch.paging.shadow.oos;
+ oos_va = other->arch.paging.shadow.oos_va;
+
+ for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+ {
+ if ( !mfn_valid(oos[idx]) )
+ continue;
+
+ if ( skip )
+ {
+ /* Update the shadows and leave the page OOS. */
+ if ( sh_skip_sync(v, oos[idx]) )
+ continue;
+ _sh_resync_l1(other, oos[idx]);
+ }
+ else
+ {
+ /* Write-protect and sync contents */
+ _sh_resync(other, oos[idx], oos_va[idx]);
+ oos[idx] = _mfn(INVALID_MFN);
+ }
+ }
+
+ if ( do_locking )
+ shadow_unlock(v->domain);
+ }
+}
+
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+ struct page_info *pg;
+
+ ASSERT(shadow_locked_by_me(v->domain));
+
+ SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n",
+ v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+ pg = mfn_to_page(gmfn);
+
+ /* Guest page must be shadowed *only* as L1 and *only* once when out
+ * of sync. Get out now if it's already out of sync, and don't
+ * unsync at all if some vcpus have paging disabled. */
+ if ( pg->shadow_flags &
+ ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync)
+ || sh_page_has_multiple_shadows(pg)
+ || !is_hvm_domain(v->domain)
+ || !v->domain->arch.paging.shadow.oos_active )
+ return 0;
+
+ pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
+ oos_hash_add(v, gmfn, va);
+ perfc_incr(shadow_unsync);
+ return 1;
+}
+
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
/**************************************************************************/
/* Code for "promoting" a guest page to the point where the shadow code is
@@ -439,6 +848,12 @@ void shadow_promote(struct vcpu *v, mfn_
struct page_info *page = mfn_to_page(gmfn);
ASSERT(mfn_valid(gmfn));
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Is the page already shadowed and out of sync? */
+ if ( page_is_out_of_sync(page) )
+ sh_resync(v, gmfn);
+#endif
/* We should never try to promote a gmfn that has writeable mappings */
ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
@@ -463,7 +878,14 @@ void shadow_demote(struct vcpu *v, mfn_t
clear_bit(type, &page->shadow_flags);
if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
+ {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Was the page out of sync? */
+ if ( page_is_out_of_sync(page) )
+ oos_hash_remove(v, gmfn);
+#endif
clear_bit(_PGC_page_table, &page->count_info);
+ }
}
/**************************************************************************/
@@ -1297,6 +1719,27 @@ static void sh_hash_audit_bucket(struct
/* Bad shadow flags on guest page? */
BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
/* Bad type count on guest page? */
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( sp->type == SH_type_l1_32_shadow
+ || sp->type == SH_type_l1_pae_shadow
+ || sp->type == SH_type_l1_64_shadow )
+ {
+ if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
+ && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
+ {
+ if ( !page_is_out_of_sync(gpg) )
+ {
+ SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
+ " and not OOS but has typecount %#lx\n",
+ sp->backpointer,
+ mfn_x(shadow_page_to_mfn(sp)),
+ gpg->u.inuse.type_info);
+ BUG();
+ }
+ }
+ }
+ else /* Not an l1 */
+#endif
if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
&& (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
{
@@ -1608,7 +2051,8 @@ void sh_destroy_shadow(struct vcpu *v, m
/* Remove all writeable mappings of a guest frame from the shadow tables
* Returns non-zero if we need to flush TLBs.
* level and fault_addr describe how we found this to be a pagetable;
- * level==0 means we have some other reason for revoking write access.*/
+ * level==0 means we have some other reason for revoking write access.
+ * If level==0 we are allowed to fail, returning -1. */
int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
unsigned int level,
@@ -1659,7 +2103,12 @@ int sh_remove_write_access(struct vcpu *
return 0;
/* Early exit if it's already a pagetable, or otherwise not writeable */
- if ( sh_mfn_is_a_page_table(gmfn)
+ if ( (sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Unless they've been allowed to go out of sync with their shadows */
+ && !mfn_oos_may_write(gmfn)
+#endif
+ )
|| (pg->u.inuse.type_info & PGT_count_mask) == 0 )
return 0;
@@ -1676,7 +2125,7 @@ int sh_remove_write_access(struct vcpu *
}
#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
- if ( v == current && level != 0 )
+ if ( v == current )
{
unsigned long gfn;
/* Heuristic: there is likely to be only one writeable mapping,
@@ -1690,6 +2139,8 @@ int sh_remove_write_access(struct vcpu *
return 1; \
} while (0)
+ if ( level == 0 && fault_addr )
+ GUESS(fault_addr, 6);
if ( v->arch.paging.mode->guest_levels == 2 )
{
@@ -1780,6 +2231,9 @@ int sh_remove_write_access(struct vcpu *
* mapping -- ioreq page, grant mapping, &c. */
if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
{
+ if ( level == 0 )
+ return -1;
+
SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
"%lu special-use mappings of it\n", mfn_x(gmfn),
(mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
@@ -2159,6 +2613,13 @@ static void sh_update_paging_modes(struc
ASSERT(shadow_mode_translate(d));
ASSERT(shadow_mode_external(d));
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Need to resync all our pages now, because if a page goes out
+ * of sync with paging enabled and is resynced with paging
+ * disabled, the resync will go wrong. */
+ shadow_resync_all(v, 0);
+#endif /* OOS */
+
if ( !hvm_paging_enabled(v) )
{
/* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
@@ -2253,6 +2714,27 @@ static void sh_update_paging_modes(struc
// different values for CR4.PSE and CR4.PGE at the same time.
// This *does* happen, at least for CR4.PGE...
}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* We need to check that all the vcpus have paging enabled to
+ * unsync PTs. */
+ if ( is_hvm_domain(d) )
+ {
+ int pe = 1;
+ struct vcpu *vptr;
+
+ for_each_vcpu(d, vptr)
+ {
+ if ( !hvm_paging_enabled(vptr) )
+ {
+ pe = 0;
+ break;
+ }
+ }
+
+ d->arch.paging.shadow.oos_active = pe;
+ }
+#endif /* OOS */
v->arch.paging.mode->update_cr3(v, 0);
}
@@ -3044,7 +3526,11 @@ void shadow_audit_tables(struct vcpu *v)
if ( !(SHADOW_AUDIT_ENABLE) )
return;
-
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) && (SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL)
+ sh_oos_audit(v->domain);
+#endif
+
if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
mask = ~1; /* Audit every table in the system */
else
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/multi.c Fri Jun 20 15:10:08 2008 +0100
@@ -305,22 +305,54 @@ shadow_check_gwalk(struct vcpu *v, unsig
}
/* Remove write access permissions from a gwalk_t in a batch, and
- * return OR-ed result for TLB flush hint
- */
+ * return an OR-ed result: a TLB flush hint, and whether we need to
+ * re-walk the guest pages.
+ *
+ * Syncing pages will remove write access to that page; but it may
+ * also give write access to other pages in the path. If we resync any
+ * pages, re-walk from the beginning.
+ */
+#define GW_RMWR_FLUSHTLB 1
+#define GW_RMWR_REWALK 2
+
static inline uint32_t
gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
{
- int rc = 0;
+ uint32_t rc = 0;
#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
- rc = sh_remove_write_access(v, gw->l3mfn, 3, va);
-#endif
- rc |= sh_remove_write_access(v, gw->l2mfn, 2, va);
-#endif
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( mfn_is_out_of_sync(gw->l3mfn) )
+ {
+ sh_resync(v, gw->l3mfn);
+ rc = GW_RMWR_REWALK;
+ }
+ else
+#endif /* OOS */
+ if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
+ rc = GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( mfn_is_out_of_sync(gw->l2mfn) )
+ {
+ sh_resync(v, gw->l2mfn);
+ rc |= GW_RMWR_REWALK;
+ }
+ else
+#endif /* OOS */
+ if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
+ rc |= GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
if ( !(guest_supports_superpages(v) &&
- (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
- rc |= sh_remove_write_access(v, gw->l1mfn, 1, va);
+ (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ && !mfn_is_out_of_sync(gw->l1mfn)
+#endif /* OOS */
+ && sh_remove_write_access(v, gw->l1mfn, 1, va) )
+ rc |= GW_RMWR_FLUSHTLB;
return rc;
}
@@ -882,7 +914,12 @@ _sh_propagate(struct vcpu *v,
// protect guest page tables
//
- if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
+ if ( unlikely((level == 1)
+ && sh_mfn_is_a_page_table(target_mfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+ && !mfn_oos_may_write(target_mfn)
+#endif /* OOS */
+ ) )
{
if ( shadow_mode_trap_reads(d) )
{
@@ -1125,6 +1162,9 @@ static int shadow_set_l4e(struct vcpu *v
domain_crash(v->domain);
return SHADOW_SET_ERROR;
}
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+ shadow_resync_all(v, 0);
+#endif
}
/* Write the new entry */
@@ -1163,12 +1203,17 @@ static int shadow_set_l3e(struct vcpu *v
| (((unsigned long)sl3e) & ~PAGE_MASK));
if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
+ {
/* About to install a new reference */
if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
{
domain_crash(v->domain);
return SHADOW_SET_ERROR;
- }
+ }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+ shadow_resync_all(v, 0);
+#endif
+ }
/* Write the new entry */
shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
@@ -1219,12 +1264,29 @@ static int shadow_set_l2e(struct vcpu *v
| (((unsigned long)sl2e) & ~PAGE_MASK));
if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
+ {
+ mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
+
/* About to install a new reference */
- if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
+ if ( !sh_get_ref(v, sl1mfn, paddr) )
{
domain_crash(v->domain);
return SHADOW_SET_ERROR;
- }
+ }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ {
+ struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
+ mfn_t gl1mfn = _mfn(sp->backpointer);
+
+ /* If the shadow is a fl1 then the backpointer contains
+ the GFN instead of the GMFN, and it's definitely not
+ OOS. */
+ if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
+ && mfn_is_out_of_sync(gl1mfn) )
+ sh_resync(v, gl1mfn);
+ }
+#endif
+ }
/* Write the new entry */
#if GUEST_PAGING_LEVELS == 2
@@ -2544,6 +2606,97 @@ static int validate_gl1e(struct vcpu *v,
return result;
}
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Special validation function for re-syncing out-of-sync shadows.
+ * Walks the *shadow* page, and for every entry that it finds,
+ * revalidates the guest entry that corresponds to it.
+ * N.B. This function is called with the vcpu that unsynced the page,
+ * *not* the one that is causing it to be resynced. */
+void sh_resync_l1(struct vcpu *v, mfn_t gmfn)
+{
+ mfn_t sl1mfn;
+ shadow_l1e_t *sl1p;
+ guest_l1e_t *gl1p, *gp;
+ int rc = 0;
+
+ sl1mfn = get_shadow_status(v, gmfn, SH_type_l1_shadow);
+ ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
+
+ gp = sh_map_domain_page(gmfn);
+ gl1p = gp;
+
+ SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
+ rc |= validate_gl1e(v, gl1p, sl1mfn, sl1p);
+ });
+
+ sh_unmap_domain_page(gp);
+
+ /* Setting shadow L1 entries should never need us to flush the TLB */
+ ASSERT(!(rc & SHADOW_SET_FLUSH));
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table.
+ * That is: if we can tell that it's only used once, and that the
+ * toplevel shadow responsible is not one of ours.
+ * N.B. This function is called with the vcpu that required the resync,
+ * *not* the one that originally unsynced the page, but it is
+ * called in the *mode* of the vcpu that unsynced it. Clear? Good. */
+int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+ struct shadow_page_info *sp;
+ mfn_t smfn;
+
+ smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+ ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
+
+ /* Up to l2 */
+ sp = mfn_to_shadow_page(smfn);
+ if ( sp->count != 1 || !sp->up )
+ return 0;
+ smfn = _mfn(sp->up >> PAGE_SHIFT);
+ ASSERT(mfn_valid(smfn));
+
+#if (SHADOW_PAGING_LEVELS == 4)
+ /* up to l3 */
+ sp = mfn_to_shadow_page(smfn);
+ if ( sp->count != 1 || !sp->up )
+ return 0;
+ smfn = _mfn(sp->up >> PAGE_SHIFT);
+ ASSERT(mfn_valid(smfn));
+
+ /* up to l4 */
+ sp = mfn_to_shadow_page(smfn);
+ if ( sp->count != 1
+ || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
+ return 0;
+ smfn = _mfn(sp->up >> PAGE_SHIFT);
+ ASSERT(mfn_valid(smfn));
+
+#if (GUEST_PAGING_LEVELS == 2)
+ /* In 2-on-3 shadow mode the up pointer contains the link to the
+ * shadow page, but the shadow_table contains only the first of the
+ * four pages that makes the PAE top shadow tables. */
+ smfn = _mfn(mfn_x(smfn) & ~0x3UL);
+#endif
+
+#endif
+
+ if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
+#if (SHADOW_PAGING_LEVELS == 3)
+ || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
+ || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
+ || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)
+#endif
+ )
+ return 0;
+
+ /* Only in use in one toplevel shadow, and it's not the one we're
+ * running on */
+ return 1;
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
/**************************************************************************/
/* Functions which translate and install the shadows of arbitrary guest
@@ -2805,6 +2958,7 @@ static int sh_page_fault(struct vcpu *v,
int r;
fetch_type_t ft = 0;
p2m_type_t p2mt;
+ uint32_t rc;
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
int fast_emul = 0;
#endif
@@ -2830,6 +2984,17 @@ static int sh_page_fault(struct vcpu *v,
{
fast_emul = 1;
gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Fall back to the slow path if we're trying to emulate
+ writes to an out of sync page. */
+ if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
+ {
+ v->arch.paging.last_write_emul_ok = 0;
+ goto page_fault_slow_path;
+ }
+#endif /* OOS */
+
perfc_incr(shadow_fault_fast_emulate);
goto early_emulation;
}
@@ -2855,6 +3020,31 @@ static int sh_page_fault(struct vcpu *v,
sizeof(sl1e)) == 0)
&& sh_l1e_is_magic(sl1e)) )
{
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* First, need to check that this isn't an out-of-sync
+ * shadow l1e. If it is, we fall back to the slow path, which
+ * will sync it up again. */
+ {
+ shadow_l2e_t sl2e;
+ mfn_t gl1mfn;
+ if ( (__copy_from_user(&sl2e,
+ (sh_linear_l2_table(v)
+ + shadow_l2_linear_offset(va)),
+ sizeof(sl2e)) != 0)
+ || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
+ || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
+ shadow_l2e_get_mfn(sl2e))->backpointer))
+ || unlikely(mfn_is_out_of_sync(gl1mfn)) )
+ {
+ /* Hit the slow path as if there had been no
+ * shadow entry at all, and let it tidy up */
+ ASSERT(regs->error_code & PFEC_page_present);
+ regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
+ goto page_fault_slow_path;
+ }
+ }
+#endif /* SHOPT_OUT_OF_SYNC */
+
if ( sh_l1e_is_gnp(sl1e) )
{
/* Not-present in a guest PT: pass to the guest as
@@ -2890,6 +3080,10 @@ static int sh_page_fault(struct vcpu *v,
return EXCRET_fault_fixed;
}
}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ page_fault_slow_path:
+#endif
#endif /* SHOPT_FAST_FAULT_PATH */
/* Detect if this page fault happened while we were already in Xen
@@ -2904,7 +3098,21 @@ static int sh_page_fault(struct vcpu *v,
return 0;
}
- if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
+ rewalk:
+ rc = guest_walk_tables(v, va, &gw, regs->error_code);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ if ( !(rc & _PAGE_PRESENT) )
+ regs->error_code |= PFEC_page_present;
+ else if ( regs->error_code & PFEC_page_present )
+ {
+ SHADOW_ERROR("OOS paranoia: Something is wrong in guest TLB"
+ " flushing. Have fun debugging it.\n");
+ regs->error_code &= ~PFEC_page_present;
+ }
+#endif
+
+ if ( rc != 0 )
{
perfc_incr(shadow_fault_bail_real_fault);
SHADOW_PRINTK("not a shadow fault\n");
@@ -2948,7 +3156,10 @@ static int sh_page_fault(struct vcpu *v,
shadow_lock(d);
- if ( gw_remove_write_accesses(v, va, &gw) )
+ rc = gw_remove_write_accesses(v, va, &gw);
+
+ /* First bit set: Removed write access to a page. */
+ if ( rc & GW_RMWR_FLUSHTLB )
{
/* Write permission removal is also a hint that other gwalks
* overlapping with this one may be inconsistent
@@ -2958,11 +3169,20 @@ static int sh_page_fault(struct vcpu *v,
flush_tlb_mask(d->domain_dirty_cpumask);
}
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Second bit set: Resynced a page. Re-walk needed. */
+ if ( rc & GW_RMWR_REWALK )
+ {
+ shadow_unlock(d);
+ goto rewalk;
+ }
+#endif /* OOS */
+
if ( !shadow_check_gwalk(v, va, &gw) )
{
perfc_incr(shadow_inconsistent_gwalk);
shadow_unlock(d);
- return EXCRET_fault_fixed;
+ goto rewalk;
}
shadow_audit_tables(v);
@@ -3001,7 +3221,12 @@ static int sh_page_fault(struct vcpu *v,
#endif
/* Need to emulate accesses to page tables */
- if ( sh_mfn_is_a_page_table(gmfn) )
+ if ( sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Unless they've been allowed to go out of sync with their shadows */
+ && !mfn_is_out_of_sync(gmfn)
+#endif
+ )
{
if ( ft == ft_demand_write )
{
@@ -3215,6 +3440,7 @@ sh_invlpg(struct vcpu *v, unsigned long
* instruction should be issued on the hardware, or 0 if it's safe not
* to do so. */
{
+ mfn_t sl1mfn;
shadow_l2e_t sl2e;
perfc_incr(shadow_invlpg);
@@ -3278,12 +3504,64 @@ sh_invlpg(struct vcpu *v, unsigned long
// If so, then we'll need to flush the entire TLB (because that's
// easier than invalidating all of the individual 4K pages).
//
- if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
+ sl1mfn = shadow_l2e_get_mfn(sl2e);
+ if ( mfn_to_shadow_page(sl1mfn)->type
== SH_type_fl1_shadow )
{
flush_tlb_local();
return 0;
}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Check to see if the SL1 is out of sync. */
+ {
+ mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+ struct page_info *pg = mfn_to_page(gl1mfn);
+ if ( mfn_valid(gl1mfn)
+ && page_is_out_of_sync(pg) )
+ {
+ /* The test above may give false positives, since we don't
+ * hold the shadow lock yet. Check again with the lock held. */
+ shadow_lock(v->domain);
+
+ /* This must still be a copy-from-user because we didn't
+ * have the shadow lock last time we checked, and the
+ * higher-level shadows might have disappeared under our
+ * feet. */
+ if ( __copy_from_user(&sl2e,
+ sh_linear_l2_table(v)
+ + shadow_l2_linear_offset(va),
+ sizeof (sl2e)) != 0 )
+ {
+ perfc_incr(shadow_invlpg_fault);
+ shadow_unlock(v->domain);
+ return 0;
+ }
+
+ if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
+ {
+ shadow_unlock(v->domain);
+ return 0;
+ }
+
+ sl1mfn = shadow_l2e_get_mfn(sl2e);
+ gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+ pg = mfn_to_page(gl1mfn);
+
+ if ( likely(sh_mfn_is_a_page_table(gl1mfn)
+ && page_is_out_of_sync(pg) ) )
+ {
+ shadow_l1e_t *sl1;
+ sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
+ /* Remove the shadow entry that maps this VA */
+ (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
+ }
+ shadow_unlock(v->domain);
+ /* Need the invlpg to pick up the disappearance of the sl1e */
+ return 1;
+ }
+ }
+#endif
return 1;
}
@@ -3709,6 +3987,13 @@ sh_update_cr3(struct vcpu *v, int do_loc
ASSERT(v->arch.cr3 == 0);
return;
}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Need to resync all the shadow entries on a TLB flush. Resync
+ * the current vcpu's OOS pages before switching to the new shadow
+ * tables so that the VA hint is still valid. */
+ shadow_resync_current_vcpu(v, do_locking);
+#endif
if ( do_locking ) shadow_lock(v->domain);
@@ -3938,6 +4223,15 @@ sh_update_cr3(struct vcpu *v, int do_loc
/* Release the lock, if we took it (otherwise it's the caller's problem) */
if ( do_locking ) shadow_unlock(v->domain);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Need to resync all the shadow entries on a TLB flush. We only
+ * update the shadows, leaving the pages out of sync. Also, we try
+ * to skip synchronization of shadows not mapped in the new
+ * tables. */
+ shadow_sync_other_vcpus(v, do_locking);
+#endif
+
}
@@ -4437,23 +4731,35 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v,
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
-#define AUDIT_FAIL(_level, _fmt, _a...) do { \
- printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
- "gl" #_level "mfn = %" PRI_mfn \
- " sl" #_level "mfn = %" PRI_mfn \
- " &gl" #_level "e = %p &sl" #_level "e = %p" \
- " gl" #_level "e = %" SH_PRI_gpte \
- " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
- GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
- _level, guest_index(gl ## _level ## e), \
- mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
- gl ## _level ## e, sl ## _level ## e, \
- gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
- ##_a); \
- BUG(); \
- done = 1; \
-} while (0)
-
+#define AUDIT_FAIL(_level, _fmt, _a...) do { \
+ printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
+ "gl" #_level "mfn = %" PRI_mfn \
+ " sl" #_level "mfn = %" PRI_mfn \
+ " &gl" #_level "e = %p &sl" #_level "e = %p" \
+ " gl" #_level "e = %" SH_PRI_gpte \
+ " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
+ GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
+ _level, guest_index(gl ## _level ## e), \
+ mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
+ gl ## _level ## e, sl ## _level ## e, \
+ gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
+ ##_a); \
+ BUG(); \
+ done = 1; \
+} while (0)
+
+#define AUDIT_FAIL_MIN(_level, _fmt, _a...) do { \
+ printk("Shadow %u-on-%u audit failed at level %i\n" \
+ "gl" #_level "mfn = %" PRI_mfn \
+ " sl" #_level "mfn = %" PRI_mfn \
+ " Error: " _fmt "\n", \
+ GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
+ _level, \
+ mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
+ ##_a); \
+ BUG(); \
+ done = 1; \
+} while (0)
static char * sh_audit_flags(struct vcpu *v, int level,
int gflags, int sflags)
@@ -4494,6 +4800,16 @@ int sh_audit_l1_table(struct vcpu *v, mf
/* Follow the backpointer */
gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
+ if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
+ {
+ oos_audit_hash_is_present(v->domain, gl1mfn);
+ return 0;
+ }
+#endif
+
gl1e = gp = sh_map_domain_page(gl1mfn);
SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
@@ -4574,6 +4890,13 @@ int sh_audit_l2_table(struct vcpu *v, mf
/* Follow the backpointer */
gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Only L1's may be out of sync. */
+ if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
+ AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
+#endif
+
gl2e = gp = sh_map_domain_page(gl2mfn);
SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
@@ -4616,6 +4939,13 @@ int sh_audit_l3_table(struct vcpu *v, mf
/* Follow the backpointer */
gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Only L1's may be out of sync. */
+ if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
+ AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
+#endif
+
gl3e = gp = sh_map_domain_page(gl3mfn);
SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
@@ -4656,6 +4986,13 @@ int sh_audit_l4_table(struct vcpu *v, mf
/* Follow the backpointer */
gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ /* Only L1's may be out of sync. */
+ if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
+ AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
+#endif
+
gl4e = gp = sh_map_domain_page(gl4mfn);
SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
{
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/multi.h
--- a/xen/arch/x86/mm/shadow/multi.h Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/multi.h Fri Jun 20 15:10:08 2008 +0100
@@ -115,3 +115,13 @@ SHADOW_INTERNAL_NAME(sh_destroy_monitor_
extern struct paging_mode
SHADOW_INTERNAL_NAME(sh_paging_mode, GUEST_LEVELS);
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+extern void
+SHADOW_INTERNAL_NAME(sh_resync_l1, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t gmfn);
+
+extern int
+SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, GUEST_LEVELS)
+ (struct vcpu*v, mfn_t gmfn);
+#endif
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/private.h Fri Jun 20 15:10:08 2008 +0100
@@ -63,8 +63,9 @@ extern int shadow_audit_enable;
#define SHOPT_SKIP_VERIFY 0x20 /* Skip PTE v'fy when safe to do so */
#define SHOPT_VIRTUAL_TLB 0x40 /* Cache guest v->p translations */
#define SHOPT_FAST_EMULATION 0x80 /* Fast write emulation */
+#define SHOPT_OUT_OF_SYNC 0x100 /* Allow guest writes to L1 PTs */
-#define SHADOW_OPTIMIZATIONS 0xff
+#define SHADOW_OPTIMIZATIONS 0x1ff
/******************************************************************************
@@ -301,6 +302,62 @@ static inline int sh_type_is_pinnable(st
#define SHF_PAE (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE|SHF_L2H_PAE)
#define SHF_64 (SHF_L1_64|SHF_FL1_64|SHF_L2_64|SHF_L2H_64|SHF_L3_64|SHF_L4_64)
+#define SHF_L1_ANY (SHF_L1_32|SHF_L1_PAE|SHF_L1_64)
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/* Marks a guest L1 page table which is shadowed but not write-protected.
+ * If set, then *only* L1 shadows (SHF_L1_*) are allowed.
+ *
+ * out_of_sync indicates that the shadow tables may not reflect the
+ * guest tables. If it is clear, then the shadow tables *must* reflect
+ * the guest tables.
+ *
+ * oos_may_write indicates that a page may have writable mappings.
+ *
+ * Most of the time the flags are synonymous. There is a short period of time
+ * during resync that oos_may_write is clear but out_of_sync is not. If a
+ * codepath is called during that time and is sensitive to oos issues, it may
+ * need to use the second flag.
+ */
+#define SHF_out_of_sync (1u<<30)
+#define SHF_oos_may_write (1u<<29)
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
+static inline int sh_page_has_multiple_shadows(struct page_info *pg)
+{
+ u32 shadows;
+ if ( !(pg->count_info & PGC_page_table) )
+ return 0;
+ shadows = pg->shadow_flags & SHF_page_type_mask;
+ /* More than one type bit set in shadow-flags? */
+ return ( (shadows & ~(1UL << find_first_set_bit(shadows))) != 0 );
+}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/* The caller must verify this is reasonable to call; i.e., valid mfn,
+ * domain is translated, &c */
+static inline int page_is_out_of_sync(struct page_info *p)
+{
+ return (p->count_info & PGC_page_table)
+ && (p->shadow_flags & SHF_out_of_sync);
+}
+
+static inline int mfn_is_out_of_sync(mfn_t gmfn)
+{
+ return page_is_out_of_sync(mfn_to_page(mfn_x(gmfn)));
+}
+
+static inline int page_oos_may_write(struct page_info *p)
+{
+ return (p->count_info & PGC_page_table)
+ && (p->shadow_flags & SHF_oos_may_write);
+}
+
+static inline int mfn_oos_may_write(mfn_t gmfn)
+{
+ return page_oos_may_write(mfn_to_page(mfn_x(gmfn)));
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
/******************************************************************************
* Various function declarations
@@ -351,7 +408,50 @@ int shadow_cmpxchg_guest_entry(struct vc
int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
intpte_t *old, intpte_t new, mfn_t gmfn);
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va);
+/* Pull an out-of-sync page back into sync. */
+void sh_resync(struct vcpu *v, mfn_t gmfn);
+
+/* Pull all out-of-sync shadows back into sync. If skip != 0, we try
+ * to avoid resyncing where we think we can get away with it. */
+
+void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking);
+
+static inline void
+shadow_resync_all(struct vcpu *v, int do_locking)
+{
+ sh_resync_all(v,
+ 0 /* skip */,
+ 1 /* this */,
+ 1 /* others */,
+ do_locking);
+}
+
+static inline void
+shadow_resync_current_vcpu(struct vcpu *v, int do_locking)
+{
+ sh_resync_all(v,
+ 0 /* skip */,
+ 1 /* this */,
+ 0 /* others */,
+ do_locking);
+}
+
+static inline void
+shadow_sync_other_vcpus(struct vcpu *v, int do_locking)
+{
+ sh_resync_all(v,
+ 1 /* skip */,
+ 0 /* this */,
+ 1 /* others */,
+ do_locking);
+}
+
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn);
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
/******************************************************************************
* Flags used in the return value of the shadow_set_lXe() functions...
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/types.h
--- a/xen/arch/x86/mm/shadow/types.h Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/types.h Fri Jun 20 15:10:08 2008 +0100
@@ -438,6 +438,10 @@ struct shadow_walk_t
#define sh_guess_wrmap INTERNAL_NAME(sh_guess_wrmap)
#define sh_clear_shadow_entry INTERNAL_NAME(sh_clear_shadow_entry)
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+#define sh_resync_l1 INTERNAL_NAME(sh_resync_l1)
+#define sh_safe_not_to_sync INTERNAL_NAME(sh_safe_not_to_sync)
+#endif
/* The sh_guest_(map|get)_* functions depends on Xen's paging levels */
#define sh_guest_map_l1e \
diff -r 26ecd1f9e128 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/include/asm-x86/domain.h Fri Jun 20 15:10:08 2008 +0100
@@ -103,6 +103,9 @@ struct shadow_domain {
* emulation and remove write permission
*/
atomic_t gtable_dirty_version;
+
+ /* OOS */
+ int oos_active;
};
struct shadow_vcpu {
@@ -122,6 +125,10 @@ struct shadow_vcpu {
unsigned long last_emulated_frame;
/* Last MFN that we emulated a write successfully */
unsigned long last_emulated_mfn;
+
+ /* Shadow out-of-sync: pages that this vcpu has let go out of sync */
+ mfn_t oos[SHADOW_OOS_PAGES];
+ unsigned long oos_va[SHADOW_OOS_PAGES];
};
/************************************************/
diff -r 26ecd1f9e128 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/include/asm-x86/mm.h Fri Jun 20 15:10:08 2008 +0100
@@ -130,6 +130,9 @@ static inline u32 pickle_domptr(struct d
/* The order of the largest allocation unit we use for shadow pages */
#define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
+/* The number of out-of-sync shadows we allow per vcpu (prime, please) */
+#define SHADOW_OOS_PAGES 7
+
#define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain))
#define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
diff -r 26ecd1f9e128 xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/include/asm-x86/perfc_defn.h Fri Jun 20 15:10:08 2008 +0100
@@ -80,6 +80,7 @@ PERFCOUNTER(shadow_writeable_h_3, "shad
PERFCOUNTER(shadow_writeable_h_3, "shadow writeable: 64b w2k3")
PERFCOUNTER(shadow_writeable_h_4, "shadow writeable: linux low/solaris")
PERFCOUNTER(shadow_writeable_h_5, "shadow writeable: linux high")
+PERFCOUNTER(shadow_writeable_h_6, "shadow writeable: unsync va")
PERFCOUNTER(shadow_writeable_bf, "shadow writeable brute-force")
PERFCOUNTER(shadow_mappings, "shadow removes all mappings")
PERFCOUNTER(shadow_mappings_bf, "shadow rm-mappings brute-force")
@@ -101,4 +102,8 @@ PERFCOUNTER(shadow_em_ex_non_pt, "shad
PERFCOUNTER(shadow_em_ex_non_pt, "shadow extra non-pt-write op")
PERFCOUNTER(shadow_em_ex_fail, "shadow extra emulation failed")
+PERFCOUNTER(shadow_unsync, "shadow OOS unsyncs")
+PERFCOUNTER(shadow_unsync_evict, "shadow OOS evictions")
+PERFCOUNTER(shadow_resync, "shadow OOS resyncs")
+
/*#endif*/ /* __XEN_PERFC_DEFN_H__ */