[Xen-devel] [PATCH 1/4] Out-of-sync L1 shadows: OOS base

To: xen-devel@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-devel] [PATCH 1/4] Out-of-sync L1 shadows: OOS base
From: Gianluca Guida <gianluca.guida@xxxxxxxxxxxxx>
Date: Fri, 20 Jun 2008 18:31:50 +0100
Delivery-date: Fri, 20 Jun 2008 10:32:35 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mozilla-Thunderbird 2.0.0.9 (X11/20080110)
This patch implements the basic mechanisms to get pagetables out of sync and back in sync again.

Signed-off-by: Gianluca Guida <gianluca.guida@xxxxxxxxxxxxx>
Signed-off-by: Tim Deegan <tim.deegan@xxxxxxxxxxxxx>
Signed-off-by: George Dunlap <george.dunlap@xxxxxxxxxxxxx>
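
Before diving into the diff, here is a minimal, self-contained sketch of the state machine the patch implements. It is illustrative only (a single global stands in for one page's page_info->shadow_flags; the real entry points are sh_unsync() and _sh_resync() in the common.c hunks below), but the flag values and the ordering inside resync match the patch:

/* Sketch only: models the SHF_out_of_sync / SHF_oos_may_write
 * transitions implemented by sh_unsync() and _sh_resync() below. */
#include <stdio.h>

#define SHF_out_of_sync   (1u << 30)   /* values as in shadow/private.h */
#define SHF_oos_may_write (1u << 29)

static unsigned int shadow_flags;

/* Guest write hits a shadowed L1: let it go out of sync (sh_unsync). */
static void unsync(void)
{
    shadow_flags |= SHF_out_of_sync | SHF_oos_may_write;
}

/* TLB-flush-equivalent event: pull the page back in sync (_sh_resync). */
static void resync(void)
{
    shadow_flags &= ~SHF_oos_may_write; /* revoke writable mappings first */
    /* ... revalidate every shadow entry (sh_resync_l1) ... */
    shadow_flags &= ~SHF_out_of_sync;   /* now guaranteed in sync */
}

int main(void)
{
    unsync();
    printf("after unsync: oos=%d may_write=%d\n",
           !!(shadow_flags & SHF_out_of_sync),
           !!(shadow_flags & SHF_oos_may_write));
    resync();
    printf("after resync: oos=%d may_write=%d\n",
           !!(shadow_flags & SHF_out_of_sync),
           !!(shadow_flags & SHF_oos_may_write));
    return 0;
}
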
diff -r 26ecd1f9e128 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm.c Fri Jun 20 15:10:08 2008 +0100
@@ -1933,9 +1933,15 @@ int get_page_type(struct page_info *page
         {
             struct domain *d = page_get_owner(page);
 
-            /* Never allow a shadowed frame to go from type count 0 to 1 */
-            if ( d && shadow_mode_enabled(d) )
-                shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
+            /* Normally we should never let a page go from type count 0
+             * to type count 1 when it is shadowed. One exception:
+             * out-of-sync shadowed pages are allowed to become
+             * writeable. */
+            if ( d && shadow_mode_enabled(d)
+                 && (page->count_info & PGC_page_table)
+                 && !((page->shadow_flags & (1u<<29))
+                      && type == PGT_writable_page) )
+                shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
 
             ASSERT(!(x & PGT_pae_xen_l2));
             if ( (x & PGT_type_mask) != type )
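
A note on the bare (1u<<29) above: it is SHF_oos_may_write, defined in xen/arch/x86/mm/shadow/private.h further down in this patch; mm.c cannot include the shadow-private header, hence the raw constant. A small standalone model of the new condition (the PGC_page_table bit position here is illustrative):

/* Sketch of the new get_page_type() condition: a shadowed pagetable
 * may take a writable type reference without being unshadowed only
 * if it has been allowed out of sync (SHF_oos_may_write). */
#include <stdio.h>

#define PGC_page_table    (1u << 31)   /* position illustrative */
#define SHF_oos_may_write (1u << 29)   /* as in shadow/private.h */

static int must_remove_shadows(unsigned int count_info,
                               unsigned int shadow_flags,
                               int taking_writable_type)
{
    return (count_info & PGC_page_table)
        && !((shadow_flags & SHF_oos_may_write) && taking_writable_type);
}

int main(void)
{
    /* Ordinary shadowed pagetable: a writable ref forces unshadowing. */
    printf("%d\n", must_remove_shadows(PGC_page_table, 0, 1));
    /* Out-of-sync L1: the writable reference is tolerated. */
    printf("%d\n", must_remove_shadows(PGC_page_table,
                                       SHF_oos_may_write, 1));
    return 0;
}
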
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c   Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/common.c   Fri Jun 20 15:10:08 2008 +0100
@@ -54,6 +54,10 @@ void shadow_domain_init(struct domain *d
     /* Use shadow pagetables for log-dirty support */
     paging_log_dirty_init(d, shadow_enable_log_dirty, 
                           shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    d->arch.paging.shadow.oos_active = 0;
+#endif
 }
 
 /* Setup the shadow-specific parts of a vcpu struct. Note: The most important
@@ -64,6 +68,13 @@ void shadow_domain_init(struct domain *d
  */
 void shadow_vcpu_init(struct vcpu *v)
 {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    int i;
+
+    for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
+        v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
+#endif
+
     v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
 }
 
@@ -427,6 +438,404 @@ void shadow_continue_emulation(struct sh
         }
     }
 }
+ 
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Out-of-sync shadows. */ 
+
+/* From time to time, we let a shadowed pagetable page go out of sync 
+ * with its shadow: the guest is allowed to write directly to the page, 
+ * and those writes are not synchronously reflected in the shadow.
+ * This lets us avoid many emulations if the guest is writing a lot to a 
+ * pagetable, but it relaxes a pretty important invariant in the shadow 
+ * pagetable design.  Therefore, some rules:
+ *
+ * 1. Only L1 pagetables may go out of sync: any page that is shadowed
+ *    at a higher level must be synchronously updated.  This makes
+ *    using linear shadow pagetables much less dangerous.
+ *    That means that: (a) unsyncing code needs to check for higher-level
+ *    shadows, and (b) promotion code needs to resync.
+ * 
+ * 2. All shadow operations on a guest page require the page to be brought
+ *    back into sync before proceeding.  This must be done under the
+ *    shadow lock so that the page is guaranteed to remain synced until
+ *    the operation completes.
+ *
+ *    Exceptions to this rule: the pagefault and invlpg handlers may 
+ *    update only one entry on an out-of-sync page without resyncing it. 
+ *
+ * 3. Operations on shadows that do not start from a guest page need to
+ *    be aware that they may be handling an out-of-sync shadow.
+ *
+ * 4. Operations that do not normally take the shadow lock (fast-path 
+ *    #PF handler, INVLPG) must fall back to a locking, syncing version 
+ *    if they see an out-of-sync table. 
+ *
+ * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
+ *    must explicitly resync all relevant pages or update their
+ *    shadows.
+ *
+ * Currently out-of-sync pages are listed in a simple open-addressed
+ * hash table with a second chance (must resist temptation to radically
+ * over-engineer hash tables...)  The virtual address of the access
+ * which caused us to unsync the page is also kept in the hash table, as
+ * a hint for finding the writable mappings later.
+ *
+ * We keep a hash per vcpu, because we want as much as possible to do
+ * the re-sync on the same vcpu we did the unsync on, so the VA hint
+ * will be valid.
+ */
+
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
+static void sh_oos_audit(struct domain *d) 
+{
+    int idx, expected_idx, expected_idx_alt;
+    struct page_info *pg;
+    struct vcpu *v;
+    
+    for_each_vcpu(d, v) 
+    {
+        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+        {
+            mfn_t *oos = v->arch.paging.shadow.oos;
+            if ( !mfn_valid(oos[idx]) )
+                continue;
+            
+            expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
+            expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
+            if ( idx != expected_idx && idx != expected_idx_alt )
+            {
+                printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
+                       __func__, idx, mfn_x(oos[idx]), 
+                       expected_idx, expected_idx_alt);
+                BUG();
+            }
+            pg = mfn_to_page(oos[idx]);
+            if ( !(pg->count_info & PGC_page_table) )
+            {
+                printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->count_info);
+                BUG();
+            }
+            if ( !(pg->shadow_flags & SHF_out_of_sync) )
+            {
+                printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+                BUG();
+            }
+            if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
+            {
+                printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+                BUG();
+            }
+        }
+    }
+}
+#endif
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn) 
+{
+    int idx;
+    struct vcpu *v;
+    mfn_t *oos;
+
+    ASSERT(mfn_is_out_of_sync(gmfn));
+    
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+            return;
+    }
+
+    SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+#endif
+
+/* Update the shadow, but keep the page out of sync. */
+static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn)
+{
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(mfn_valid(gmfn));
+    ASSERT(page_is_out_of_sync(pg));
+
+    /* Call out to the appropriate per-mode resyncing function */
+    if ( pg->shadow_flags & SHF_L1_32 )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn);
+    else if ( pg->shadow_flags & SHF_L1_PAE )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn);
+#if CONFIG_PAGING_LEVELS >= 4
+    else if ( pg->shadow_flags & SHF_L1_64 )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn);
+#endif
+}
+
+/* Pull all the entries on an out-of-sync page back into sync. */
+static void _sh_resync(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(shadow_locked_by_me(v->domain));
+    ASSERT(mfn_is_out_of_sync(gmfn));
+    /* Guest page must be shadowed *only* as L1 when out of sync. */
+    ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask 
+             & ~SHF_L1_ANY));
+    ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
+
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+    /* Need to pull write access so the page *stays* in sync. 
+     * This might be rather slow but we hope that in the common case 
+     * we're handling this pagetable after a guest walk has pulled 
+     * write access the fast way. */
+    switch ( sh_remove_write_access(v, gmfn, 0, va) )
+    {
+    default:
+    case 0:
+        break;
+
+    case 1:
+        flush_tlb_mask(v->domain->domain_dirty_cpumask);
+        break;
+
+    case -1:
+        /* An unfindable writeable typecount has appeared, probably via a
+         * grant table entry: can't shoot the mapping, so try to unshadow 
+         * the page.  If that doesn't work either, the guest is granting
+         * his pagetables and must be killed after all. */
+        sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
+        return;
+    }
+
+    /* No more writable mappings of this page, please */
+    pg->shadow_flags &= ~SHF_oos_may_write;
+
+    /* Update the shadows with current guest entries. */
+    _sh_resync_l1(v, gmfn);
+
+    /* Now we know all the entries are synced, and will stay that way */
+    pg->shadow_flags &= ~SHF_out_of_sync;
+    perfc_incr(shadow_resync);
+}
+
+
+/* Add an MFN to the list of out-of-sync guest pagetables */
+static void oos_hash_add(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    int idx;
+    mfn_t *oos = v->arch.paging.shadow.oos;
+    unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+
+    idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+    if ( mfn_valid(oos[idx]) 
+         && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
+    {
+        /* Punt the current occupant into the next slot */
+        SWAP(oos[idx], gmfn);
+        SWAP(oos_va[idx], va);
+        idx = (idx + 1) % SHADOW_OOS_PAGES;
+    }
+    if ( mfn_valid(oos[idx]) )
+    {
+        /* Crush the current occupant. */
+        _sh_resync(v, oos[idx], oos_va[idx]);
+        perfc_incr(shadow_unsync_evict);
+    }
+    oos[idx] = gmfn;
+    oos_va[idx] = va;
+}
+
+/* Remove an MFN from the list of out-of-sync guest pagetables */
+static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
+{
+    int idx;
+    mfn_t *oos;
+    struct domain *d = v->domain;
+
+    SHADOW_PRINTK("D%dV%d gmfn %lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); 
+
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+        {
+            oos[idx] = _mfn(INVALID_MFN);
+            return;
+        }
+    }
+
+    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+
+/* Pull a single guest page back into sync */
+void sh_resync(struct vcpu *v, mfn_t gmfn)
+{
+    int idx;
+    mfn_t *oos;
+    unsigned long *oos_va;
+    struct domain *d = v->domain;
+
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        oos_va = v->arch.paging.shadow.oos_va;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+        {
+            _sh_resync(v, gmfn, oos_va[idx]);
+            oos[idx] = _mfn(INVALID_MFN);
+            return;
+        }
+    }
+
+    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table,
+ * by making a call out to the mode in which that shadow was made. */
+static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+    struct page_info *pg = mfn_to_page(gl1mfn);
+    if ( pg->shadow_flags & SHF_L1_32 )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
+    else if ( pg->shadow_flags & SHF_L1_PAE )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
+#if CONFIG_PAGING_LEVELS >= 4
+    else if ( pg->shadow_flags & SHF_L1_64 )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
+#endif
+    SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n", 
+                 mfn_x(gl1mfn));
+    BUG();
+    return 0; /* BUG() is no longer __attribute__((noreturn)). */
+}
+
+
+/* Pull all out-of-sync pages back into sync.  Pages brought out of sync
+ * on other vcpus are allowed to remain out of sync, but their contents
+ * will be made safe (TLB flush semantics); pages unsynced by this vcpu
+ * are brought back into sync and write-protected.  If skip != 0, we try
+ * to avoid resyncing at all if we think we can get away with it. */
+void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
+{
+    int idx;
+    struct vcpu *other;
+    mfn_t *oos = v->arch.paging.shadow.oos;
+    unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+
+    SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
+
+    ASSERT(do_locking || shadow_locked_by_me(v->domain));
+
+    if ( !this )
+        goto resync_others;
+
+    if ( do_locking )
+        shadow_lock(v->domain);
+
+    /* First: resync all of this vcpu's oos pages */
+    for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) 
+        if ( mfn_valid(oos[idx]) )
+        {
+            /* Write-protect and sync contents */
+            _sh_resync(v, oos[idx], oos_va[idx]);
+            oos[idx] = _mfn(INVALID_MFN);
+        }
+
+    if ( do_locking )
+        shadow_unlock(v->domain);
+
+ resync_others:
+    if ( !others )
+        return;
+
+    /* Second: make all *other* vcpus' oos pages safe. */
+    for_each_vcpu(v->domain, other)
+    {
+        if ( v == other ) 
+            continue;
+
+        if ( do_locking )
+            shadow_lock(v->domain);
+
+        oos = other->arch.paging.shadow.oos;
+        oos_va = other->arch.paging.shadow.oos_va;
+
+        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) 
+        {
+            if ( !mfn_valid(oos[idx]) )
+                continue;
+
+            if ( skip )
+            {
+                /* Update the shadows and leave the page OOS. */
+                if ( sh_skip_sync(v, oos[idx]) )
+                    continue;
+                _sh_resync_l1(other, oos[idx]);
+            }
+            else
+            {
+                /* Write-protect and sync contents */
+                _sh_resync(other, oos[idx], oos_va[idx]);
+                oos[idx] = _mfn(INVALID_MFN);
+            }
+        }
+        
+        if ( do_locking )
+            shadow_unlock(v->domain);
+    }
+}
+
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    struct page_info *pg;
+    
+    ASSERT(shadow_locked_by_me(v->domain));
+
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+    pg = mfn_to_page(gmfn);
+ 
+    /* Guest page must be shadowed *only* as L1 and *only* once when out
+     * of sync.  Also, get out now if it's already out of sync. 
+     * Also, can't safely unsync if some vcpus have paging disabled. */
+    if ( pg->shadow_flags & 
+         ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync) 
+         || sh_page_has_multiple_shadows(pg)
+         || !is_hvm_domain(v->domain)
+         || !v->domain->arch.paging.shadow.oos_active )
+        return 0;
+
+    pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
+    oos_hash_add(v, gmfn, va);
+    perfc_incr(shadow_unsync);
+    return 1;
+}
+
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
 
 /**************************************************************************/
 /* Code for "promoting" a guest page to the point where the shadow code is
@@ -439,6 +848,12 @@ void shadow_promote(struct vcpu *v, mfn_
     struct page_info *page = mfn_to_page(gmfn);
 
     ASSERT(mfn_valid(gmfn));
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Is the page already shadowed and out of sync? */
+    if ( page_is_out_of_sync(page) ) 
+        sh_resync(v, gmfn);
+#endif
 
     /* We should never try to promote a gmfn that has writeable mappings */
     ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
@@ -463,7 +878,14 @@ void shadow_demote(struct vcpu *v, mfn_t
     clear_bit(type, &page->shadow_flags);
 
     if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
+    {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+        /* Was the page out of sync? */
+        if ( page_is_out_of_sync(page) ) 
+            oos_hash_remove(v, gmfn);
+#endif 
         clear_bit(_PGC_page_table, &page->count_info);
+    }
 }
 
 /**************************************************************************/
@@ -1297,6 +1719,27 @@ static void sh_hash_audit_bucket(struct 
             /* Bad shadow flags on guest page? */
             BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
             /* Bad type count on guest page? */
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+            if ( sp->type == SH_type_l1_32_shadow
+                 || sp->type == SH_type_l1_pae_shadow
+                 || sp->type == SH_type_l1_64_shadow )
+            {
+                if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
+                     && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
+                {
+                    if ( !page_is_out_of_sync(gpg) )
+                    {
+                        SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
+                                     " and not OOS but has typecount %#lx\n",
+                                     sp->backpointer, 
+                                     mfn_x(shadow_page_to_mfn(sp)), 
+                                     gpg->u.inuse.type_info);
+                        BUG();
+                    }
+                }
+            }
+            else /* Not an l1 */
+#endif
             if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page 
                  && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
             {
@@ -1608,7 +2051,8 @@ void sh_destroy_shadow(struct vcpu *v, m
 /* Remove all writeable mappings of a guest frame from the shadow tables 
  * Returns non-zero if we need to flush TLBs. 
 * level and fault_addr describe how we found this to be a pagetable;
- * level==0 means we have some other reason for revoking write access.*/
+ * level==0 means we have some other reason for revoking write access.
+ * If level==0 we are allowed to fail, returning -1. */
 
 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn, 
                            unsigned int level,
@@ -1659,7 +2103,12 @@ int sh_remove_write_access(struct vcpu *
         return 0;
 
     /* Early exit if it's already a pagetable, or otherwise not writeable */
-    if ( sh_mfn_is_a_page_table(gmfn) 
+    if ( (sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+         /* Unless they've been allowed to go out of sync with their shadows */
+           && !mfn_oos_may_write(gmfn)
+#endif
+         )
          || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
         return 0;
 
@@ -1676,7 +2125,7 @@ int sh_remove_write_access(struct vcpu *
     }
 
 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
-    if ( v == current && level != 0 )
+    if ( v == current )
     {
         unsigned long gfn;
         /* Heuristic: there is likely to be only one writeable mapping,
@@ -1690,6 +2139,8 @@ int sh_remove_write_access(struct vcpu *
                 return 1;                                                 \
         } while (0)
 
+        if ( level == 0 && fault_addr )
+            GUESS(fault_addr, 6);
         
         if ( v->arch.paging.mode->guest_levels == 2 )
         {
@@ -1780,6 +2231,9 @@ int sh_remove_write_access(struct vcpu *
      * mapping -- ioreq page, grant mapping, &c. */
     if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
     {
+        if ( level == 0 )
+            return -1;
+
         SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
                       "%lu special-use mappings of it\n", mfn_x(gmfn),
                       (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
@@ -2159,6 +2613,13 @@ static void sh_update_paging_modes(struc
         ASSERT(shadow_mode_translate(d));
         ASSERT(shadow_mode_external(d));
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+        /* Need to resync all our pages now, because if a page goes out
+         * of sync with paging enabled and is resynced with paging
+         * disabled, the resync will go wrong. */
+        shadow_resync_all(v, 0);
+#endif /* OOS */
+
         if ( !hvm_paging_enabled(v) )
         {
             /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
@@ -2253,6 +2714,27 @@ static void sh_update_paging_modes(struc
         //        different values for CR4.PSE and CR4.PGE at the same time.
         //        This *does* happen, at least for CR4.PGE...
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* We need to check that all the vcpus have paging enabled to
+     * unsync PTs. */
+    if ( is_hvm_domain(d) )
+    {
+        int pe = 1;
+        struct vcpu *vptr;
+
+        for_each_vcpu(d, vptr)
+        {
+            if ( !hvm_paging_enabled(vptr) )
+            {
+                pe = 0;
+                break;
+            }
+        }
+
+        d->arch.paging.shadow.oos_active = pe;
+    }
+#endif /* OOS */
 
     v->arch.paging.mode->update_cr3(v, 0);
 }
@@ -3044,7 +3526,11 @@ void shadow_audit_tables(struct vcpu *v)
 
     if ( !(SHADOW_AUDIT_ENABLE) )
         return;
-    
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    sh_oos_audit(v->domain);
+#endif
+
     if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
         mask = ~1; /* Audit every table in the system */
     else 
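
The per-vcpu OOS table added in this file is an open-addressed hash with a "second chance": a gmfn may live either in its home slot (gmfn % SHADOW_OOS_PAGES) or in the slot after it, so lookups probe at most two slots and an insertion evicts (resyncs) at most one page. A simplified standalone model, with plain unsigned longs instead of mfn_t and eviction reduced to a printf:

/* Sketch of oos_hash_add()/lookup: open addressing, two slots max. */
#include <stdio.h>

#define SHADOW_OOS_PAGES 7             /* prime, as in asm-x86/mm.h */
#define INVALID_MFN      (~0UL)

static unsigned long oos[SHADOW_OOS_PAGES];

static void oos_hash_add(unsigned long gmfn)
{
    int idx = gmfn % SHADOW_OOS_PAGES;
    if ( oos[idx] != INVALID_MFN
         && (int)(oos[idx] % SHADOW_OOS_PAGES) == idx )
    {
        /* Punt the home-slot occupant into the next slot. */
        unsigned long tmp = oos[idx];
        oos[idx] = gmfn;
        gmfn = tmp;
        idx = (idx + 1) % SHADOW_OOS_PAGES;
    }
    if ( oos[idx] != INVALID_MFN )
        printf("evicting %#lx (would call _sh_resync)\n", oos[idx]);
    oos[idx] = gmfn;
}

static int oos_hash_find(unsigned long gmfn)
{
    int idx = gmfn % SHADOW_OOS_PAGES;
    if ( oos[idx] != gmfn )             /* second chance: next slot */
        idx = (idx + 1) % SHADOW_OOS_PAGES;
    return ( oos[idx] == gmfn ) ? idx : -1;
}

int main(void)
{
    int i;
    for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
        oos[i] = INVALID_MFN;
    oos_hash_add(0x1000);               /* home slot 0x1000 % 7 == 1 */
    oos_hash_add(0x1007);               /* same home: 0x1000 punted to 2 */
    printf("0x1000 at %d, 0x1007 at %d\n",
           oos_hash_find(0x1000), oos_hash_find(0x1007));
    return 0;
}

SHADOW_OOS_PAGES is prime ("prime, please" in the asm-x86/mm.h hunk below), presumably so that strided mfn sequences, which are common for pagetable allocations, still spread over all slots instead of piling onto a single home slot.
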
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/multi.c    Fri Jun 20 15:10:08 2008 +0100
@@ -305,22 +305,54 @@ shadow_check_gwalk(struct vcpu *v, unsig
 }
 
 /* Remove write access permissions from a gwalk_t in a batch, and
- * return OR-ed result for TLB flush hint
- */
+ * return OR-ed result for TLB flush hint and need to rewalk the guest
+ * pages.
+ *
+ * Syncing pages will remove write access to that page; but it may
+ * also give write access to other pages in the path. If we resync any
+ * pages, re-walk from the beginning.
+ */
+#define GW_RMWR_FLUSHTLB 1
+#define GW_RMWR_REWALK   2
+
 static inline uint32_t
 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
 {
-    int rc = 0;
+    uint32_t rc = 0;
 
 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-    rc = sh_remove_write_access(v, gw->l3mfn, 3, va);
-#endif
-    rc |= sh_remove_write_access(v, gw->l2mfn, 2, va);
-#endif
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( mfn_is_out_of_sync(gw->l3mfn) )
+    {
+        sh_resync(v, gw->l3mfn);
+        rc = GW_RMWR_REWALK;
+    }
+    else
+#endif /* OOS */
+     if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
+         rc = GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( mfn_is_out_of_sync(gw->l2mfn) )
+    {
+        sh_resync(v, gw->l2mfn);
+        rc |= GW_RMWR_REWALK;
+    }
+    else
+#endif /* OOS */
+    if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
+        rc |= GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
     if ( !(guest_supports_superpages(v) &&
-           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
-        rc |= sh_remove_write_access(v, gw->l1mfn, 1, va);
+           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+         && !mfn_is_out_of_sync(gw->l1mfn)
+#endif /* OOS */
+         && sh_remove_write_access(v, gw->l1mfn, 1, va) )
+        rc |= GW_RMWR_FLUSHTLB;
 
     return rc;
 }
@@ -882,7 +914,12 @@ _sh_propagate(struct vcpu *v,
     
     // protect guest page tables
     //
-    if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
+    if ( unlikely((level == 1) 
+                  && sh_mfn_is_a_page_table(target_mfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+                  && !mfn_oos_may_write(target_mfn)
+#endif /* OOS */
+                  ) )
     {
         if ( shadow_mode_trap_reads(d) )
         {
@@ -1125,6 +1162,9 @@ static int shadow_set_l4e(struct vcpu *v
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
         }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+        shadow_resync_all(v, 0);
+#endif
     }
 
     /* Write the new entry */
@@ -1163,12 +1203,17 @@ static int shadow_set_l3e(struct vcpu *v
              | (((unsigned long)sl3e) & ~PAGE_MASK));
     
     if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
+    {
         /* About to install a new reference */        
         if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
         {
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
-        } 
+        }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+        shadow_resync_all(v, 0);
+#endif
+    }
 
     /* Write the new entry */
     shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
@@ -1219,12 +1264,29 @@ static int shadow_set_l2e(struct vcpu *v
              | (((unsigned long)sl2e) & ~PAGE_MASK));
 
     if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) 
+    {
+        mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
+
         /* About to install a new reference */
-        if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
+        if ( !sh_get_ref(v, sl1mfn, paddr) )
         {
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
-        } 
+        }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+        {
+            struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
+            mfn_t gl1mfn = _mfn(sp->backpointer);
+
+            /* If the shadow is a fl1 then the backpointer contains
+               the GFN instead of the GMFN, and it's definitely not
+               OOS. */
+            if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
+                 && mfn_is_out_of_sync(gl1mfn) )
+                sh_resync(v, gl1mfn);
+        }
+#endif
+    }
 
     /* Write the new entry */
 #if GUEST_PAGING_LEVELS == 2
@@ -2544,6 +2606,97 @@ static int validate_gl1e(struct vcpu *v,
     return result;
 }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Special validation function for re-syncing out-of-sync shadows. 
+ * Walks the *shadow* page, and for every entry that it finds,
+ * revalidates the guest entry that corresponds to it.
+ * N.B. This function is called with the vcpu that unsynced the page,
+ *      *not* the one that is causing it to be resynced. */
+void sh_resync_l1(struct vcpu *v, mfn_t gmfn)
+{
+    mfn_t sl1mfn;
+    shadow_l1e_t *sl1p;
+    guest_l1e_t *gl1p, *gp;
+    int rc = 0;
+
+    sl1mfn = get_shadow_status(v, gmfn, SH_type_l1_shadow);
+    ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
+
+    gp = sh_map_domain_page(gmfn);
+    gl1p = gp;
+
+    SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
+        rc |= validate_gl1e(v, gl1p, sl1mfn, sl1p);
+    });
+
+    sh_unmap_domain_page(gp);
+
+    /* Setting shadow L1 entries should never need us to flush the TLB */
+    ASSERT(!(rc & SHADOW_SET_FLUSH));
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table. 
+ * That is: if we can tell that it's only used once, and that the 
+ * toplevel shadow responsible is not one of ours. 
+ * N.B. This function is called with the vcpu that required the resync, 
+ *      *not* the one that originally unsynced the page, but it is
+ *      called in the *mode* of the vcpu that unsynced it.  Clear?  Good. */
+int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+    struct shadow_page_info *sp;
+    mfn_t smfn;
+
+    smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+    ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
+    
+    /* Up to l2 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+#if (SHADOW_PAGING_LEVELS == 4) 
+    /* up to l3 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+    /* up to l4 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 
+         || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+#if (GUEST_PAGING_LEVELS == 2)
+    /* In 2-on-3 shadow mode the up pointer contains the link to the
+     * shadow page, but the shadow_table contains only the first of the
+     * four pages that make up the PAE top shadow tables. */
+    smfn = _mfn(mfn_x(smfn) & ~0x3UL);
+#endif
+
+#endif
+
+    if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
+#if (SHADOW_PAGING_LEVELS == 3) 
+         || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
+         || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
+         || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn) 
+#endif
+        )
+        return 0;
+    
+    /* Only in use in one toplevel shadow, and it's not the one we're 
+     * running on */
+    return 1;
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
 
 /**************************************************************************/
 /* Functions which translate and install the shadows of arbitrary guest 
@@ -2805,6 +2958,7 @@ static int sh_page_fault(struct vcpu *v,
     int r;
     fetch_type_t ft = 0;
     p2m_type_t p2mt;
+    uint32_t rc;
 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
     int fast_emul = 0;
 #endif
@@ -2830,6 +2984,17 @@ static int sh_page_fault(struct vcpu *v,
         {
             fast_emul = 1;
             gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+            /* Fall back to the slow path if we're trying to emulate
+               writes to an out of sync page. */
+            if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
+            {
+                v->arch.paging.last_write_emul_ok = 0;
+                goto page_fault_slow_path;
+            }
+#endif /* OOS */
+
             perfc_incr(shadow_fault_fast_emulate);
             goto early_emulation;
         }
@@ -2855,6 +3020,31 @@ static int sh_page_fault(struct vcpu *v,
                                       sizeof(sl1e)) == 0)
                     && sh_l1e_is_magic(sl1e)) )
         {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+            /* First, need to check that this isn't an out-of-sync
+             * shadow l1e.  If it is, we fall back to the slow path, which
+             * will sync it up again. */
+            {
+                shadow_l2e_t sl2e;
+                mfn_t gl1mfn;
+                if ( (__copy_from_user(&sl2e,
+                                       (sh_linear_l2_table(v)
+                                        + shadow_l2_linear_offset(va)),
+                                       sizeof(sl2e)) != 0)
+                     || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
+                     || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
+                                      shadow_l2e_get_mfn(sl2e))->backpointer))
+                     || unlikely(mfn_is_out_of_sync(gl1mfn)) )
+                {
+                    /* Hit the slow path as if there had been no
+                     * shadow entry at all, and let it tidy up */
+                    ASSERT(regs->error_code & PFEC_page_present);
+                    regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
+                    goto page_fault_slow_path;
+                }
+            }
+#endif /* SHOPT_OUT_OF_SYNC */
+
             if ( sh_l1e_is_gnp(sl1e) )
             {
                 /* Not-present in a guest PT: pass to the guest as
@@ -2890,6 +3080,10 @@ static int sh_page_fault(struct vcpu *v,
             return EXCRET_fault_fixed;
         }
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+ page_fault_slow_path:
+#endif
 #endif /* SHOPT_FAST_FAULT_PATH */
 
     /* Detect if this page fault happened while we were already in Xen
@@ -2904,7 +3098,21 @@ static int sh_page_fault(struct vcpu *v,
         return 0;
     }
 
-    if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
+ rewalk:
+    rc = guest_walk_tables(v, va, &gw, regs->error_code);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( !(rc & _PAGE_PRESENT) )
+        regs->error_code |= PFEC_page_present;
+    else if ( regs->error_code & PFEC_page_present )
+    {
+        SHADOW_ERROR("OOS paranoia: Something is wrong in guest TLB"
+                     " flushing. Have fun debugging it.\n");
+        regs->error_code &= ~PFEC_page_present;
+    }
+#endif
+
+    if ( rc != 0 )
     {
         perfc_incr(shadow_fault_bail_real_fault);
         SHADOW_PRINTK("not a shadow fault\n");
@@ -2948,7 +3156,10 @@ static int sh_page_fault(struct vcpu *v,
 
     shadow_lock(d);
 
-    if ( gw_remove_write_accesses(v, va, &gw) )
+    rc = gw_remove_write_accesses(v, va, &gw);
+
+    /* First bit set: Removed write access to a page. */
+    if ( rc & GW_RMWR_FLUSHTLB )
     {
         /* Write permission removal is also a hint that other gwalks
          * overlapping with this one may be inconsistent
@@ -2958,11 +3169,20 @@ static int sh_page_fault(struct vcpu *v,
         flush_tlb_mask(d->domain_dirty_cpumask);
     }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Second bit set: Resynced a page. Re-walk needed. */
+    if ( rc & GW_RMWR_REWALK )
+    {
+        shadow_unlock(d);
+        goto rewalk;
+    }
+#endif /* OOS */
+
     if ( !shadow_check_gwalk(v, va, &gw) )
     {
         perfc_incr(shadow_inconsistent_gwalk);
         shadow_unlock(d);
-        return EXCRET_fault_fixed;
+        goto rewalk;
     }
 
     shadow_audit_tables(v);
@@ -3001,7 +3221,12 @@ static int sh_page_fault(struct vcpu *v,
 #endif
 
     /* Need to emulate accesses to page tables */
-    if ( sh_mfn_is_a_page_table(gmfn) )
+    if ( sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+         /* Unless they've been allowed to go out of sync with their shadows */
+         && !mfn_is_out_of_sync(gmfn)
+#endif
+         )
     {
         if ( ft == ft_demand_write )
         {
@@ -3215,6 +3440,7 @@ sh_invlpg(struct vcpu *v, unsigned long 
  * instruction should be issued on the hardware, or 0 if it's safe not
  * to do so. */
 {
+    mfn_t sl1mfn;
     shadow_l2e_t sl2e;
     
     perfc_incr(shadow_invlpg);
@@ -3278,12 +3504,64 @@ sh_invlpg(struct vcpu *v, unsigned long 
     // If so, then we'll need to flush the entire TLB (because that's
     // easier than invalidating all of the individual 4K pages).
     //
-    if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
+    sl1mfn = shadow_l2e_get_mfn(sl2e);
+    if ( mfn_to_shadow_page(sl1mfn)->type
          == SH_type_fl1_shadow )
     {
         flush_tlb_local();
         return 0;
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Check to see if the SL1 is out of sync. */
+    {
+        mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+        struct page_info *pg = mfn_to_page(gl1mfn);
+        if ( mfn_valid(gl1mfn) 
+             && page_is_out_of_sync(pg) )
+        {
+            /* The test above may give false positives, since we don't
+             * hold the shadow lock yet.  Check again with the lock held. */
+            shadow_lock(v->domain);
+
+            /* This must still be a copy-from-user because we didn't
+             * have the shadow lock last time we checked, and the
+             * higher-level shadows might have disappeared under our
+             * feet. */
+            if ( __copy_from_user(&sl2e, 
+                                  sh_linear_l2_table(v)
+                                  + shadow_l2_linear_offset(va),
+                                  sizeof (sl2e)) != 0 )
+            {
+                perfc_incr(shadow_invlpg_fault);
+                shadow_unlock(v->domain);
+                return 0;
+            }
+
+            if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
+            {
+                shadow_unlock(v->domain);
+                return 0;
+            }
+
+            sl1mfn = shadow_l2e_get_mfn(sl2e);
+            gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+            pg = mfn_to_page(gl1mfn);
+            
+            if ( likely(sh_mfn_is_a_page_table(gl1mfn)
+                        && page_is_out_of_sync(pg) ) )
+            {
+                shadow_l1e_t *sl1;
+                sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
+                /* Remove the shadow entry that maps this VA */
+                (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
+            }
+            shadow_unlock(v->domain);
+            /* Need the invlpg to pick up the disappearance of the sl1e */
+            return 1;
+        }
+    }
+#endif
 
     return 1;
 }
@@ -3709,6 +3987,13 @@ sh_update_cr3(struct vcpu *v, int do_loc
         ASSERT(v->arch.cr3 == 0);
         return;
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Need to resync all the shadow entries on a TLB flush.  Resync
+     * the current vcpu's OOS pages before switching to the new shadow
+     * tables so that the VA hint is still valid.  */
+    shadow_resync_current_vcpu(v, do_locking);
+#endif
 
     if ( do_locking ) shadow_lock(v->domain);
 
@@ -3938,6 +4223,15 @@ sh_update_cr3(struct vcpu *v, int do_loc
 
     /* Release the lock, if we took it (otherwise it's the caller's problem) */
     if ( do_locking ) shadow_unlock(v->domain);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Need to resync all the shadow entries on a TLB flush. We only
+     * update the shadows, leaving the pages out of sync. Also, we try
+     * to skip synchronization of shadows not mapped in the new
+     * tables. */
+    shadow_sync_other_vcpus(v, do_locking);
+#endif
+
 }
 
 
@@ -4437,23 +4731,35 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v,
 
 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
 
-#define AUDIT_FAIL(_level, _fmt, _a...) do {                               \
-    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"         \
-           "gl" #_level "mfn = %" PRI_mfn                              \
-           " sl" #_level "mfn = %" PRI_mfn                             \
-           " &gl" #_level "e = %p &sl" #_level "e = %p"                    \
-           " gl" #_level "e = %" SH_PRI_gpte                              \
-           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",        \
-           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                      \
-           _level, guest_index(gl ## _level ## e),                         \
-           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),         \
-           gl ## _level ## e, sl ## _level ## e,                           \
-           gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
-           ##_a);                                                          \
-    BUG();                                                                 \
-    done = 1;                                                              \
-} while (0)
-
+#define AUDIT_FAIL(_level, _fmt, _a...) do {                            \
+    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"       \
+           "gl" #_level "mfn = %" PRI_mfn                               \
+           " sl" #_level "mfn = %" PRI_mfn                              \
+           " &gl" #_level "e = %p &sl" #_level "e = %p"                 \
+           " gl" #_level "e = %" SH_PRI_gpte                            \
+           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",      \
+           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                   \
+           _level, guest_index(gl ## _level ## e),                      \
+           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),      \
+           gl ## _level ## e, sl ## _level ## e,                        \
+           gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
+           ##_a);                                                       \
+    BUG();                                                              \
+    done = 1;                                                           \
+} while (0)
+
+#define AUDIT_FAIL_MIN(_level, _fmt, _a...) do {                        \
+    printk("Shadow %u-on-%u audit failed at level %i\n"                 \
+           "gl" #_level "mfn = %" PRI_mfn                               \
+           " sl" #_level "mfn = %" PRI_mfn                              \
+           " Error: " _fmt "\n",                                        \
+           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                   \
+           _level,                                                      \
+           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),      \
+           ##_a);                                                       \
+    BUG();                                                              \
+    done = 1;                                                           \
+} while (0)
 
 static char * sh_audit_flags(struct vcpu *v, int level,
                               int gflags, int sflags) 
@@ -4494,6 +4800,16 @@ int sh_audit_l1_table(struct vcpu *v, mf
     
     /* Follow the backpointer */
     gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
+    if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
+    {
+        oos_audit_hash_is_present(v->domain, gl1mfn);
+        return 0;
+    }
+#endif
+
     gl1e = gp = sh_map_domain_page(gl1mfn);
     SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
 
@@ -4574,6 +4890,13 @@ int sh_audit_l2_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
+        AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
+#endif
+
     gl2e = gp = sh_map_domain_page(gl2mfn);
     SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
 
@@ -4616,6 +4939,13 @@ int sh_audit_l3_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
+        AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
+#endif
+
     gl3e = gp = sh_map_domain_page(gl3mfn);
     SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
 
@@ -4656,6 +4986,13 @@ int sh_audit_l4_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
+        AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
+#endif
+
     gl4e = gp = sh_map_domain_page(gl4mfn);
     SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
     {
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/multi.h
--- a/xen/arch/x86/mm/shadow/multi.h    Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/multi.h    Fri Jun 20 15:10:08 2008 +0100
@@ -115,3 +115,13 @@ SHADOW_INTERNAL_NAME(sh_destroy_monitor_
 
 extern struct paging_mode 
 SHADOW_INTERNAL_NAME(sh_paging_mode, GUEST_LEVELS);
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+extern void 
+SHADOW_INTERNAL_NAME(sh_resync_l1, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t gmfn);
+
+extern int
+SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, GUEST_LEVELS)
+     (struct vcpu*v, mfn_t gmfn);
+#endif
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h  Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/private.h  Fri Jun 20 15:10:08 2008 +0100
@@ -63,8 +63,9 @@ extern int shadow_audit_enable;
 #define SHOPT_SKIP_VERIFY         0x20  /* Skip PTE v'fy when safe to do so */
 #define SHOPT_VIRTUAL_TLB         0x40  /* Cache guest v->p translations */
 #define SHOPT_FAST_EMULATION      0x80  /* Fast write emulation */
+#define SHOPT_OUT_OF_SYNC        0x100  /* Allow guest writes to L1 PTs */
 
-#define SHADOW_OPTIMIZATIONS      0xff
+#define SHADOW_OPTIMIZATIONS     0x1ff
 
 
 /******************************************************************************
@@ -301,6 +302,62 @@ static inline int sh_type_is_pinnable(st
 #define SHF_PAE (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE|SHF_L2H_PAE)
 #define SHF_64  (SHF_L1_64|SHF_FL1_64|SHF_L2_64|SHF_L2H_64|SHF_L3_64|SHF_L4_64)
 
+#define SHF_L1_ANY  (SHF_L1_32|SHF_L1_PAE|SHF_L1_64)
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+/* Marks a guest L1 page table which is shadowed but not write-protected.
+ * If set, then *only* L1 shadows (SHF_L1_*) are allowed. 
+ *
+ * out_of_sync indicates that the shadow tables may not reflect the
+ * guest tables.  If it is clear, then the shadow tables *must* reflect
+ * the guest tables.
+ *
+ * oos_may_write indicates that a page may have writable mappings.
+ *
+ * Most of the time the flags are synonymous.  There is a short period of time 
+ * during resync that oos_may_write is clear but out_of_sync is not.  If a 
+ * codepath is called during that time and is sensitive to oos issues, it may 
+ * need to use the second flag.
+ */
+#define SHF_out_of_sync (1u<<30)
+#define SHF_oos_may_write (1u<<29)
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
+static inline int sh_page_has_multiple_shadows(struct page_info *pg)
+{
+    u32 shadows;
+    if ( !(pg->count_info & PGC_page_table) )
+        return 0;
+    shadows = pg->shadow_flags & SHF_page_type_mask;
+    /* More than one type bit set in shadow-flags? */
+    return ( (shadows & ~(1UL << find_first_set_bit(shadows))) != 0 );
+}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+/* The caller must verify this is reasonable to call; i.e., valid mfn,
+ * domain is translated, &c */
+static inline int page_is_out_of_sync(struct page_info *p) 
+{
+    return (p->count_info & PGC_page_table)
+        && (p->shadow_flags & SHF_out_of_sync);
+}
+
+static inline int mfn_is_out_of_sync(mfn_t gmfn) 
+{
+    return page_is_out_of_sync(mfn_to_page(mfn_x(gmfn)));
+}
+
+static inline int page_oos_may_write(struct page_info *p) 
+{
+    return (p->count_info & PGC_page_table)
+        && (p->shadow_flags & SHF_oos_may_write);
+}
+
+static inline int mfn_oos_may_write(mfn_t gmfn) 
+{
+    return page_oos_may_write(mfn_to_page(mfn_x(gmfn)));
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
 
 /******************************************************************************
  * Various function declarations 
@@ -351,7 +408,50 @@ int shadow_cmpxchg_guest_entry(struct vc
 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
                                intpte_t *old, intpte_t new, mfn_t gmfn);
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va);
 
+/* Pull an out-of-sync page back into sync. */
+void sh_resync(struct vcpu *v, mfn_t gmfn);
+
+/* Pull all out-of-sync shadows back into sync.  If skip != 0, we try
+ * to avoid resyncing where we think we can get away with it. */
+
+void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking);
+
+static inline void
+shadow_resync_all(struct vcpu *v, int do_locking)
+{
+    sh_resync_all(v,
+                  0 /* skip */,
+                  1 /* this */,
+                  1 /* others */,
+                  do_locking);
+}
+
+static inline void
+shadow_resync_current_vcpu(struct vcpu *v, int do_locking)
+{
+    sh_resync_all(v,
+                  0 /* skip */,
+                  1 /* this */, 
+                  0 /* others */,
+                  do_locking);
+}
+
+static inline void
+shadow_sync_other_vcpus(struct vcpu *v, int do_locking)
+{
+    sh_resync_all(v,
+                  1 /* skip */, 
+                  0 /* this */,
+                  1 /* others */,
+                  do_locking);
+}
+
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn);
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
 
 /******************************************************************************
  * Flags used in the return value of the shadow_set_lXe() functions...
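
The three inline wrappers above fix the sh_resync_all() parameters for their intended call sites elsewhere in the patch. A stub run that just echoes the arguments makes the mapping explicit ("this" is renamed "this_vcpu" here purely for clarity):

/* Stub illustrating how the wrappers parameterize sh_resync_all(). */
#include <stdio.h>

static void sh_resync_all(int skip, int this_vcpu, int others, int locking)
{
    printf("skip=%d this=%d others=%d locking=%d\n",
           skip, this_vcpu, others, locking);
}

int main(void)
{
    /* shadow_resync_all(): shadow_set_l3e/l4e and paging-mode changes */
    sh_resync_all(0, 1, 1, 0);
    /* shadow_resync_current_vcpu(): sh_update_cr3, before the table
     * switch, while this vcpu's VA hints are still valid */
    sh_resync_all(0, 1, 0, 1);
    /* shadow_sync_other_vcpus(): sh_update_cr3, after the switch; other
     * vcpus' pages may stay OOS (skip=1) but are made safe */
    sh_resync_all(1, 0, 1, 1);
    return 0;
}
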
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/types.h
--- a/xen/arch/x86/mm/shadow/types.h    Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/types.h    Fri Jun 20 15:10:08 2008 +0100
@@ -438,6 +438,10 @@ struct shadow_walk_t
 #define sh_guess_wrmap             INTERNAL_NAME(sh_guess_wrmap)
 #define sh_clear_shadow_entry      INTERNAL_NAME(sh_clear_shadow_entry)
 
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+#define sh_resync_l1               INTERNAL_NAME(sh_resync_l1)
+#define sh_safe_not_to_sync        INTERNAL_NAME(sh_safe_not_to_sync)
+#endif
 
 /* The sh_guest_(map|get)_* functions depends on Xen's paging levels */
 #define sh_guest_map_l1e \
diff -r 26ecd1f9e128 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/include/asm-x86/domain.h      Fri Jun 20 15:10:08 2008 +0100
@@ -103,6 +103,9 @@ struct shadow_domain {
      * emulation and remove write permission
      */
     atomic_t          gtable_dirty_version;
+
+    /* OOS */
+    int oos_active;
 };
 
 struct shadow_vcpu {
@@ -122,6 +125,10 @@ struct shadow_vcpu {
     unsigned long last_emulated_frame;
     /* Last MFN that we emulated a write successfully */
     unsigned long last_emulated_mfn;
+
+    /* Shadow out-of-sync: pages that this vcpu has let go out of sync */
+    mfn_t oos[SHADOW_OOS_PAGES];
+    unsigned long oos_va[SHADOW_OOS_PAGES];
 };
 
 /************************************************/
diff -r 26ecd1f9e128 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/include/asm-x86/mm.h  Fri Jun 20 15:10:08 2008 +0100
@@ -130,6 +130,9 @@ static inline u32 pickle_domptr(struct d
 /* The order of the largest allocation unit we use for shadow pages */
 #define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
 
+/* The number of out-of-sync shadows we allow per vcpu (prime, please) */
+#define SHADOW_OOS_PAGES 7
+
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
 
diff -r 26ecd1f9e128 xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h  Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/include/asm-x86/perfc_defn.h  Fri Jun 20 15:10:08 2008 +0100
@@ -80,6 +80,7 @@ PERFCOUNTER(shadow_writeable_h_3,  "shad
 PERFCOUNTER(shadow_writeable_h_3,  "shadow writeable: 64b w2k3")
 PERFCOUNTER(shadow_writeable_h_4,  "shadow writeable: linux low/solaris")
 PERFCOUNTER(shadow_writeable_h_5,  "shadow writeable: linux high")
+PERFCOUNTER(shadow_writeable_h_6,  "shadow writeable: unsync va")
 PERFCOUNTER(shadow_writeable_bf,   "shadow writeable brute-force")
 PERFCOUNTER(shadow_mappings,       "shadow removes all mappings")
 PERFCOUNTER(shadow_mappings_bf,    "shadow rm-mappings brute-force")
@@ -101,4 +102,8 @@ PERFCOUNTER(shadow_em_ex_non_pt,   "shad
 PERFCOUNTER(shadow_em_ex_non_pt,   "shadow extra non-pt-write op")
 PERFCOUNTER(shadow_em_ex_fail,     "shadow extra emulation failed")
 
+PERFCOUNTER(shadow_unsync,         "shadow OOS unsyncs")
+PERFCOUNTER(shadow_unsync_evict,   "shadow OOS evictions")
+PERFCOUNTER(shadow_resync,         "shadow OOS resyncs")
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */