[Xen-devel] [RFC][PATCH] 3/9 Populate-on-demand memory: PoD core

To: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [RFC][PATCH] 3/9 Populate-on-demand memory: PoD core
From: "George Dunlap" <George.Dunlap@xxxxxxxxxxxxx>
Date: Tue, 23 Dec 2008 13:45:52 +0000
Core of populate-on-demand functionality:
* Introduce a populate-on-demand type
* Call p2m_pod_demand_populate() when gfn_to_mfn() encounters PoD entries
* Return p2m memory to the domain list for freeing during domain destruction
* Have the p2m audit code cross-check our PoD-entry reference counts
* Add PoD information to the 'q' debug key

Signed-off-by: George Dunlap <george.dunlap@xxxxxxxxxxxxx>
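
For readers new to the series: a PoD entry is a p2m placeholder backed by no
machine frame.  The first non-query lookup that hits one takes a real page
from a per-domain cache and rewrites the entry as ordinary RAM.  The fragment
below is a rough sketch of that flow, not part of the patch: pod_cache_take()
is a hypothetical helper standing in for the list manipulation that
p2m_pod_demand_populate() does under the page_alloc lock, and all locking and
superpage handling are elided.

    /* Sketch only -- see p2m_pod_demand_populate() below for the real
     * logic, including locking and the order-9 (2MB) case. */
    static int pod_populate_sketch(struct domain *d, unsigned long gfn)
    {
        struct page_info *p;

        /* Hypothetical helper: pop one page from d->arch.p2m->pod.single */
        p = pod_cache_take(d);
        if ( p == NULL )
            return -1;   /* the real code crashes the domain instead */

        /* Install a normal read/write mapping and keep the M2P consistent */
        set_p2m_entry(d, gfn, page_to_mfn(p), 0, p2m_ram_rw);
        set_gpfn_from_mfn(mfn_x(page_to_mfn(p)), gfn);

        return 0;
    }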

diff -r e61e4075a710 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Wed Dec 17 12:59:51 2008 +0000
+++ b/xen/arch/x86/domain.c     Fri Dec 19 17:51:53 2008 +0000
@@ -147,6 +147,11 @@
                    _p(page_to_mfn(page)),
                    page->count_info, page->u.inuse.type_info);
         }
+    }
+
+    if ( is_hvm_domain(d) )
+    {
+        p2m_pod_dump_data(d);
     }

     list_for_each_entry ( page, &d->xenpage_list, list )
diff -r e61e4075a710 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c     Wed Dec 17 12:59:51 2008 +0000
+++ b/xen/arch/x86/mm/p2m.c     Fri Dec 19 17:51:53 2008 +0000
@@ -118,9 +118,16 @@
         return flags;
     case p2m_mmio_direct:
         return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_PCD;
+    case p2m_populate_on_demand:
+        return flags;
     }
 }

+#if P2M_AUDIT
+static void audit_p2m(struct domain *d);
+#else
+# define audit_p2m(_d) do { (void)(_d); } while(0)
+#endif /* P2M_AUDIT */

 // Find the next level's P2M entry, checking for out-of-range gfn's...
 // Returns NULL on error.
@@ -162,7 +169,8 @@
                                       shift, max)) )
         return 0;

-    if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
+    /* PoD: Not present doesn't imply empty. */
+    if ( !l1e_get_flags(*p2m_entry) )
     {
         struct page_info *pg = d->arch.p2m->alloc_page(d);
         if ( pg == NULL )
@@ -197,7 +205,7 @@
         }
     }

-    ASSERT(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT);
+    ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));

     /* split single large page into 4KB page in P2M table */
     if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
@@ -240,6 +248,236 @@
     *table = next;

     return 1;
+}
+
+/*
+ * Populate-on-demand functionality
+ */
+int
+p2m_pod_cache_add(struct domain *d,
+                  struct page_info *page,
+                  unsigned long order)
+{
+    int i;
+    struct page_info *p;
+    struct p2m_domain *p2md = d->arch.p2m;
+
+#ifndef NDEBUG
+    mfn_t mfn;
+
+    mfn = page_to_mfn(page);
+
+    /* Check to make sure this is a contiguous region */
+    if( mfn_x(mfn) & ((1 << order) - 1) )
+    {
+        printk("%s: mfn %lx not aligned order %lu! (mask %lx)\n",
+               __func__, mfn_x(mfn), order, ((1UL << order) - 1));
+        return -1;
+    }
+
+    for(i=0; i < 1 << order ; i++) {
+        struct domain * od;
+
+        p = mfn_to_page(_mfn(mfn_x(mfn) + i));
+        od = page_get_owner(p);
+        if(od != d)
+        {
+            printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
+                   __func__, mfn_x(mfn), d->domain_id,
+                   od?od->domain_id:-1);
+            return -1;
+        }
+    }
+#endif
+
+    spin_lock(&d->page_alloc_lock);
+
+    /* First, take all pages off the domain list */
+    for(i=0; i < 1 << order ; i++)
+    {
+        p = page + i;
+        list_del(&p->list);
+    }
+
+    /* Then add the first one to the appropriate populate-on-demand list */
+    switch(order)
+    {
+    case 9:
+        list_add_tail(&page->list, &p2md->pod.super); /* lock: page_alloc */
+        p2md->pod.count += 1 << order;
+        break;
+    case 0:
+        list_add_tail(&page->list, &p2md->pod.single); /* lock: page_alloc */
+        p2md->pod.count += 1;
+        break;
+    default:
+        BUG();
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+
+    return 0;
+}
+
+void
+p2m_pod_empty_cache(struct domain *d)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    struct list_head *q, *p;
+
+    spin_lock(&d->page_alloc_lock);
+
+    list_for_each_safe(p, q, &p2md->pod.super) /* lock: page_alloc */
+    {
+        int i;
+        struct page_info *page;
+
+        list_del(p);
+
+        page = list_entry(p, struct page_info, list);
+
+        for ( i = 0 ; i < (1 << 9) ; i++ )
+        {
+            BUG_ON(page_get_owner(page + i) != d);
+            list_add_tail(&page[i].list, &d->page_list);
+        }
+
+        p2md->pod.count -= 1<<9;
+    }
+
+    list_for_each_safe(p, q, &p2md->pod.single)
+    {
+        struct page_info *page;
+
+        list_del(p);
+
+        page = list_entry(p, struct page_info, list);
+
+        BUG_ON(page_get_owner(page) != d);
+        list_add_tail(&page->list, &d->page_list);
+
+        p2md->pod.count -= 1;
+    }
+
+    BUG_ON(p2md->pod.count != 0);
+
+    spin_unlock(&d->page_alloc_lock);
+}
+
+void
+p2m_pod_dump_data(struct domain *d)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+
+    printk("    PoD entries=%d cachesize=%d\n",
+           p2md->pod.entry_count, p2md->pod.count);
+}
+
+static int
+p2m_pod_demand_populate(struct domain *d, unsigned long gfn,
+                        mfn_t table_mfn,
+                        l1_pgentry_t *p2m_entry,
+                        unsigned int order,
+                        p2m_query_t q)
+{
+    struct page_info *p = NULL; /* Compiler warnings */
+    unsigned long gfn_aligned;
+    mfn_t mfn;
+    l1_pgentry_t entry_content = l1e_empty();
+    struct p2m_domain *p2md = d->arch.p2m;
+    int i;
+
+    /* We need to grab the p2m lock here and re-check the entry to make
+     * sure that someone else hasn't populated it for us, then hold it
+     * until we're done. */
+    p2m_lock(p2md);
+    audit_p2m(d);
+
+    /* Check to make sure this is still PoD */
+    if ( p2m_flags_to_type(l1e_get_flags(*p2m_entry)) !=
+         p2m_populate_on_demand )
+    {
+        p2m_unlock(p2md);
+        return 0;
+    }
+
+    spin_lock(&d->page_alloc_lock);
+
+    if ( p2md->pod.count == 0 )
+        goto out_of_memory;
+
+    /* FIXME -- use single pages / splinter superpages if need be */
+    switch ( order )
+    {
+    case 9:
+        BUG_ON( list_empty(&p2md->pod.super) );
+        p = list_entry(p2md->pod.super.next, struct page_info, list);
+        p2md->pod.count -= 1 << order; /* Lock: page_alloc */
+        break;
+    case 0:
+        BUG_ON( list_empty(&p2md->pod.single) );
+        p = list_entry(p2md->pod.single.next, struct page_info, list);
+        p2md->pod.count -= 1;
+        break;
+    default:
+        BUG();
+    }
+
+    list_del(&p->list);
+
+    mfn = page_to_mfn(p);
+
+    BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
+
+    /* Put the pages back on the domain page_list */
+    for ( i = 0 ; i < (1 << order) ; i++ )
+    {
+        BUG_ON(page_get_owner(p + i) != d);
+        list_add_tail(&p[i].list, &d->page_list);
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+
+    /* Fill in the entry in the p2m */
+    switch ( order )
+    {
+    case 9:
+    {
+        l2_pgentry_t l2e_content;
+
+        l2e_content = l2e_from_pfn(mfn_x(mfn),
+                                   p2m_type_to_flags(p2m_ram_rw) | _PAGE_PSE);
+
+        entry_content.l1 = l2e_content.l2;
+    }
+    break;
+    case 0:
+        entry_content = l1e_from_pfn(mfn_x(mfn),
+                                     p2m_type_to_flags(p2m_ram_rw));
+        break;
+
+    }
+
+    gfn_aligned = (gfn >> order) << order;
+
+    paging_write_p2m_entry(d, gfn_aligned, p2m_entry, table_mfn,
+                           entry_content, (order==9)?2:1);
+
+    for( i = 0 ; i < (1UL << order) ; i++ )
+        set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i);
+
+    p2md->pod.entry_count -= (1 << order); /* Lock: p2m */
+    BUG_ON(p2md->pod.entry_count < 0);
+    audit_p2m(d);
+    p2m_unlock(p2md);
+
+    return 0;
+out_of_memory:
+    spin_unlock(&d->page_alloc_lock);
+    audit_p2m(d);
+    p2m_unlock(p2md);
+    printk("%s: Out of populate-on-demand memory!\n", __func__);
+    domain_crash(d);
+    return -1;
 }

 // Returns 0 on error (out of memory)
@@ -303,6 +541,7 @@
                                    L2_PAGETABLE_ENTRIES);
         ASSERT(p2m_entry);

+        /* FIXME: Deal with 4k replaced by 2meg pages */
         if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
              !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
         {
@@ -311,7 +550,7 @@
             goto out;
         }

-        if ( mfn_valid(mfn) )
+        if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
                                        p2m_type_to_flags(p2mt) | _PAGE_PSE);
         else
@@ -403,8 +642,21 @@

     l2e = map_domain_page(mfn_x(mfn));
     l2e += l2_table_offset(addr);
+
+pod_retry_l2:
     if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
     {
+        /* PoD: Try to populate a 2-meg chunk */
+        if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
+        {
+            if ( q != p2m_query ) {
+                if( !p2m_pod_demand_populate(d, gfn, mfn,
+                                             (l1_pgentry_t *)l2e, 9, q) )
+                    goto pod_retry_l2;
+            } else
+                *t = p2m_populate_on_demand;
+        }
+
         unmap_domain_page(l2e);
         return _mfn(INVALID_MFN);
     }
@@ -423,8 +675,20 @@

     l1e = map_domain_page(mfn_x(mfn));
     l1e += l1_table_offset(addr);
+pod_retry_l1:
     if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
     {
+        /* PoD: Try to populate */
+        if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand )
+        {
+            if ( q != p2m_query ) {
+                if( !p2m_pod_demand_populate(d, gfn, mfn,
+                                             (l1_pgentry_t *)l1e, 0, q) )
+                    goto pod_retry_l1;
+            } else
+                *t = p2m_populate_on_demand;
+        }
+
         unmap_domain_page(l1e);
         return _mfn(INVALID_MFN);
     }
@@ -450,48 +714,114 @@

     if ( gfn <= current->domain->arch.p2m->max_mapped_pfn )
     {
-        l1_pgentry_t l1e = l1e_empty();
+        l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
         l2_pgentry_t l2e = l2e_empty();
         int ret;

         ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START)
                / sizeof(l1_pgentry_t));

+        /*
+         * Read & process L2
+         */
+        p2m_entry = &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START)
+                                       + l2_linear_offset(addr)];
+
+    pod_retry_l2:
         ret = __copy_from_user(&l2e,
-                               &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) +
-                                                  l2_linear_offset(addr)],
+                               p2m_entry,
                                sizeof(l2e));
+        if ( ret != 0
+             || !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+        {
+            if( (l2e_get_flags(l2e) & _PAGE_PSE)
+                && ( p2m_flags_to_type(l2e_get_flags(l2e))
+                     == p2m_populate_on_demand ) )
+            {
+                /* The read has succeeded, so we know that the mapping
+                 * exists at this point.  */
+                if ( q != p2m_query )
+                {
+                    if( !p2m_pod_demand_populate(current->domain, gfn, mfn,
+                                                 p2m_entry, 9, q) )
+                        goto pod_retry_l2;
+
+                    /* Allocate failed. */
+                    p2mt = p2m_invalid;
+                    printk("%s: Allocate failed!\n", __func__);
+                    goto out;
+                }
+                else
+                {
+                    p2mt = p2m_populate_on_demand;
+                    goto out;
+                }
+            }
+
+            goto pod_retry_l1;
+        }

-        if ( (ret == 0) && (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
-             (l2e_get_flags(l2e) & _PAGE_PSE) )
+        if ( l2e_get_flags(l2e) & _PAGE_PSE )
         {
             p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
             ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
+
             if ( p2m_is_valid(p2mt) )
                 mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
             else
                 p2mt = p2m_mmio_dm;
+
+            goto out;
         }
-        else
-        {
-
-            /* Need to __copy_from_user because the p2m is sparse and this
-             * part might not exist */
-            ret = __copy_from_user(&l1e,
-                                   &phys_to_machine_mapping[gfn],
-                                   sizeof(l1e));
+
+        /*
+         * Read and process L1
+         */
+
+        /* Need to __copy_from_user because the p2m is sparse and this
+         * part might not exist */
+    pod_retry_l1:
+        p2m_entry = &phys_to_machine_mapping[gfn];
+
+        ret = __copy_from_user(&l1e,
+                               p2m_entry,
+                               sizeof(l1e));

-            if ( ret == 0 ) {
-                p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
-                ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
-                if ( p2m_is_valid(p2mt) )
-                    mfn = _mfn(l1e_get_pfn(l1e));
-                else
-                    /* XXX see above */
-                    p2mt = p2m_mmio_dm;
+        if ( ret == 0 ) {
+            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+
+            if ( p2m_flags_to_type(l1e_get_flags(l1e))
+                 == p2m_populate_on_demand )
+            {
+                /* The read has succeeded, so we know that the mapping
+                 * exists at this point.  */
+                if ( q != p2m_query )
+                {
+                    if( !p2m_pod_demand_populate(current->domain, gfn, mfn,
+                                                 (l1_pgentry_t *)p2m_entry, 0,
+                                                 q) )
+                        goto pod_retry_l1;
+
+                    /* Allocate failed. */
+                    p2mt = p2m_invalid;
+                    goto out;
+                }
+                else
+                {
+                    p2mt = p2m_populate_on_demand;
+                    goto out;
+                }
             }
+
+            if ( p2m_is_valid(p2mt) )
+                mfn = _mfn(l1e_get_pfn(l1e));
+            else
+                /* XXX see above */
+                p2mt = p2m_mmio_dm;
         }
     }
-
+out:
     *t = p2mt;
     return mfn;
 }
@@ -510,6 +840,8 @@
     memset(p2m, 0, sizeof(*p2m));
     p2m_lock_init(p2m);
     INIT_LIST_HEAD(&p2m->pages);
+    INIT_LIST_HEAD(&p2m->pod.super);
+    INIT_LIST_HEAD(&p2m->pod.single);

     p2m->set_entry = p2m_set_entry;
     p2m->get_entry = p2m_gfn_to_mfn;
@@ -680,6 +1012,7 @@
     struct page_info *page;
     struct domain *od;
     unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
+    int entry_count = 0;
     mfn_t p2mfn;
     unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
     int test_linear;
@@ -809,6 +1142,10 @@
                 {
                     if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
                     {
+                        if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE)
+                             && ( p2m_flags_to_type(l2e_get_flags(l2e[i2]))
+                                  == p2m_populate_on_demand ) )
+                            entry_count += (1 << 9);
                         gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
                         continue;
                     }
@@ -839,13 +1176,20 @@
                     for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
                     {
                         if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+                        {
+                            if ( p2m_flags_to_type(l1e_get_flags(l1e[i1]))
+                                 == p2m_populate_on_demand )
+                                entry_count++;
                             continue;
+                        }
                         mfn = l1e_get_pfn(l1e[i1]);
                         ASSERT(mfn_valid(_mfn(mfn)));
                         m2pfn = get_gpfn_from_mfn(mfn);
                         if ( m2pfn != gfn )
                         {
                             pmbad++;
+                            printk("mismatch: gfn %#lx -> mfn %#lx"
+                                   " -> gfn %#lx\n", gfn, mfn, m2pfn);
                             P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
                                        " -> gfn %#lx\n", gfn, mfn, m2pfn);
                             BUG();
@@ -868,6 +1212,15 @@

     }

+    if ( entry_count != d->arch.p2m->pod.entry_count )
+    {
+        printk("%s: refcounted entry count %d, audit count %d!\n",
+               __func__,
+               d->arch.p2m->pod.entry_count,
+               entry_count);
+        BUG();
+    }
+
     //P2M_PRINTK("p2m audit complete\n");
     //if ( orphans_i | orphans_d | mpbad | pmbad )
     //    P2M_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
@@ -876,8 +1229,6 @@
         P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
                    pmbad, mpbad);
 }
-#else
-#define audit_p2m(_d) do { (void)(_d); } while(0)
 #endif /* P2M_AUDIT */


@@ -915,6 +1266,77 @@
 }

 int
+guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
+                                      unsigned int order)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    unsigned long i;
+    p2m_type_t ot;
+    mfn_t omfn;
+    int pod_count = 0;
+    int rc = 0;
+
+    BUG_ON(!paging_mode_translate(d));
+
+#if CONFIG_PAGING_LEVELS == 3
+    /*
+     * 32bit PAE nested paging does not support over 4GB guest due to
+     * hardware translation limit. This limitation is checked by comparing
+     * gfn with 0xfffffUL.
+     */
+    if ( paging_mode_hap(d) && (gfn > 0xfffffUL) )
+    {
+        if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
+            dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
+                    " 4GB: specify 'hap=0' domain config option.\n",
+                    d->domain_id);
+        return -EINVAL;
+    }
+#endif
+
+    p2m_lock(p2md);
+    audit_p2m(d);
+
+    P2M_DEBUG("mark pod gfn=%#lx\n", gfn);
+
+    /* Make sure all gpfns are unused */
+    for ( i = 0; i < (1UL << order); i++ )
+    {
+        omfn = gfn_to_mfn_query(d, gfn + i, &ot);
+        if ( p2m_is_ram(ot) )
+        {
+            printk("%s: gfn_to_mfn returned type %d!\n",
+                   __func__, ot);
+            rc = -EBUSY;
+            goto out;
+        }
+        else if ( ot == p2m_populate_on_demand )
+        {
+            /* Count how many PoD entries we'll be replacing if successful */
+            pod_count++;
+        }
+    }
+
+    /* Now, actually do the two-way mapping */
+    if ( !set_p2m_entry(d, gfn, _mfn(POPULATE_ON_DEMAND_MFN), order,
+                        p2m_populate_on_demand) )
+        rc = -EINVAL;
+    else
+    {
+        p2md->pod.entry_count += 1 << order; /* Lock: p2m */
+        p2md->pod.entry_count -= pod_count;
+        BUG_ON(p2md->pod.entry_count < 0);
+    }
+
+out:
+    audit_p2m(d);
+    p2m_unlock(p2md);
+
+    return rc;
+
+}
+
+int
 guest_physmap_add_entry(struct domain *d, unsigned long gfn,
                         unsigned long mfn, unsigned int page_order,
                         p2m_type_t t)
@@ -922,6 +1344,7 @@
     unsigned long i, ogfn;
     p2m_type_t ot;
     mfn_t omfn;
+    int pod_count = 0;
     int rc = 0;

     if ( !paging_mode_translate(d) )
@@ -969,6 +1392,11 @@
             ASSERT(mfn_valid(omfn));
             set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
         }
+        else if ( ot == p2m_populate_on_demand )
+        {
+            /* Count how many PoD entries we'll be replacing if successful */
+            pod_count++;
+        }
     }

     /* Then, look for m->p mappings for this range and deal with them */
@@ -1015,6 +1443,11 @@
         if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order,
                             p2m_invalid) )
             rc = -EINVAL;
+        else
+        {
+            d->arch.p2m->pod.entry_count -= pod_count; /* Lock: p2m */
+            BUG_ON(d->arch.p2m->pod.entry_count < 0);
+        }
     }

     audit_p2m(d);
diff -r e61e4075a710 xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c  Wed Dec 17 12:59:51 2008 +0000
+++ b/xen/arch/x86/mm/paging.c  Fri Dec 19 17:51:53 2008 +0000
@@ -585,6 +585,9 @@

     /* clean up log dirty resources. */
     paging_log_dirty_teardown(d);
+
+    /* Move populate-on-demand cache back to domain_list for destruction */
+    p2m_pod_empty_cache(d);
 }

 /* Call once all of the references to the domain have gone away */
diff -r e61e4075a710 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Wed Dec 17 12:59:51 2008 +0000
+++ b/xen/arch/x86/mm/shadow/multi.c    Fri Dec 19 17:51:53 2008 +0000
@@ -2173,7 +2173,7 @@
         mfn_t gl3mfn = gfn_to_mfn_query(d, gl3gfn, &p2mt);
         if ( p2m_is_ram(p2mt) )
             sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
-        else
+        else if ( p2mt != p2m_populate_on_demand )
             result |= SHADOW_SET_ERROR;

 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
@@ -2230,7 +2230,7 @@
         mfn_t gl2mfn = gfn_to_mfn_query(v->domain, gl2gfn, &p2mt);
         if ( p2m_is_ram(p2mt) )
             sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
-        else
+        else if ( p2mt != p2m_populate_on_demand )
             result |= SHADOW_SET_ERROR;

 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
@@ -2278,8 +2278,8 @@
         {
             mfn_t gl1mfn = gfn_to_mfn_query(v->domain, gl1gfn, &p2mt);
             if ( p2m_is_ram(p2mt) )
-                sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
-            else
+                sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+            else if ( p2mt != p2m_populate_on_demand )
                 result |= SHADOW_SET_ERROR;
         }
     }
diff -r e61e4075a710 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h Wed Dec 17 12:59:51 2008 +0000
+++ b/xen/include/asm-x86/p2m.h Fri Dec 19 17:51:53 2008 +0000
@@ -64,6 +64,7 @@
     p2m_ram_ro = 3,             /* Read-only; writes are silently dropped */
     p2m_mmio_dm = 4,            /* Reads and write go to the device model */
     p2m_mmio_direct = 5,        /* Read/write mapping of genuine MMIO area */
+    p2m_populate_on_demand = 6, /* Place-holder for empty memory */
 } p2m_type_t;

 typedef enum {
@@ -88,11 +89,19 @@
 #define P2M_RO_TYPES (p2m_to_mask(p2m_ram_logdirty)     \
                       | p2m_to_mask(p2m_ram_ro))

+#define P2M_MAGIC_TYPES (p2m_to_mask(p2m_populate_on_demand))
+
 /* Useful predicates */
 #define p2m_is_ram(_t) (p2m_to_mask(_t) & P2M_RAM_TYPES)
 #define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES)
 #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
+#define p2m_is_magic(_t) (p2m_to_mask(_t) & P2M_MAGIC_TYPES)
 #define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES))
+
+/* Populate-on-demand */
+#define POPULATE_ON_DEMAND_MFN  (1<<9)
+#define POD_PAGE_ORDER 9
+

 struct p2m_domain {
     /* Lock that protects updates to the p2m */
@@ -122,6 +131,28 @@

     /* Highest guest frame that's ever been mapped in the p2m */
     unsigned long max_mapped_pfn;
+
+    /* Populate-on-demand variables
+     * NB on locking.  {super,single,count} are
+     * covered by d->page_alloc_lock, since they're almost always used in
+     * conjunction with that functionality.  {entry_count} is covered by
+     * the domain p2m lock, since it's almost always used in conjunction
+     * with changing the p2m tables.
+     *
+     * At this point, both locks are held in two places.  In both,
+     * the order is [p2m,page_alloc]:
+     * + p2m_pod_decrease_reservation() calls p2m_pod_cache_add(),
+     *   which grabs page_alloc
+     * + p2m_pod_demand_populate() grabs both; the p2m lock to avoid
+     *   double-demand-populating of pages, the page_alloc lock to
+     *   protect moving stuff from the PoD cache to the domain page list.
+     */
+    struct {
+        struct list_head super,        /* List of superpages                */
+                         single;       /* Non-super lists                   */
+        int              count,        /* # of pages in cache lists         */
+                         entry_count;  /* # of pages in p2m marked pod      */
+    } pod;
 };

 /* Extract the type from the PTE flags that store it */
@@ -220,10 +251,21 @@
 void p2m_teardown(struct domain *d);
 void p2m_final_teardown(struct domain *d);

+/* Dump PoD information about the domain */
+void p2m_pod_dump_data(struct domain *d);
+
+/* Move all pages from the populate-on-demand cache to the domain page_list
+ * (usually in preparation for domain destruction) */
+void p2m_pod_empty_cache(struct domain *d);
+
 /* Add a page to a domain's p2m table */
 int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
                             unsigned long mfn, unsigned int page_order,
                             p2m_type_t t);
+
+/* Set a p2m range as populate-on-demand */
+int guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
+                                          unsigned int order);

 /* Untyped version for RAM only, for compatibility
  *

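A usage note, not part of the patch: guest_physmap_mark_populate_on_demand()
is the hypervisor-side entry point for installing the placeholders.  However
the hypercall plumbing lands in the rest of the series, a caller marking a
guest's upper range as PoD would presumably loop over it in order-9 chunks,
roughly as below; pod_start and max_gfn are assumed inputs, not names from
this patch.

    /* Hedged sketch: mark gfns [pod_start, max_gfn) populate-on-demand
     * in 2MB chunks, stopping on the first failure. */
    unsigned long gfn;
    int rc = 0;

    for ( gfn = pod_start; gfn < max_gfn && rc == 0;
          gfn += (1UL << POD_PAGE_ORDER) )
        rc = guest_physmap_mark_populate_on_demand(d, gfn, POD_PAGE_ORDER);
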
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
