[Xen-devel] [RFC][PATCH] 3/9 Populate-on-demand memory: PoD core

To: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [RFC][PATCH] 3/9 Populate-on-demand memory: PoD core
From: "George Dunlap" <George.Dunlap@xxxxxxxxxxxxx>
Date: Tue, 23 Dec 2008 13:45:52 +0000
Core of populate-on-demand functionality:
* Introduce a populate-on-demand type
* Call p2m_pod_demand_populate() when gfn_to_mfn() encounters PoD entries
* Return p2m memory to the domain list for freeing during domain destruction
* Have the p2m audit code cross-check our PoD-entry reference counts
* Add PoD information to the 'q' debug key

Signed-off-by: George Dunlap <george.dunlap@xxxxxxxxxxxxx>
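
For readers new to the series: a PoD entry is a p2m placeholder backed by no
machine frame.  The first non-query lookup that hits one takes a real page
from a per-domain cache and rewrites the entry as ordinary RAM.  The fragment
below is a rough sketch of that flow, not part of the patch: pod_cache_take()
is a hypothetical helper standing in for the list manipulation that
p2m_pod_demand_populate() does under the page_alloc lock, and all locking and
superpage handling are elided.

    /* Sketch only -- see p2m_pod_demand_populate() below for the real
     * logic, including locking and the order-9 (2MB) case. */
    static int pod_populate_sketch(struct domain *d, unsigned long gfn)
    {
        struct page_info *p;

        /* Hypothetical helper: pop one page from d->arch.p2m->pod.single */
        p = pod_cache_take(d);
        if ( p == NULL )
            return -1;   /* the real code crashes the domain instead */

        /* Install a normal read/write mapping and keep the M2P consistent */
        set_p2m_entry(d, gfn, page_to_mfn(p), 0, p2m_ram_rw);
        set_gpfn_from_mfn(mfn_x(page_to_mfn(p)), gfn);

        return 0;
    }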

diff -r e61e4075a710 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Wed Dec 17 12:59:51 2008 +0000
+++ b/xen/arch/x86/domain.c     Fri Dec 19 17:51:53 2008 +0000
@@ -147,6 +147,11 @@
                    _p(page_to_mfn(page)),
                    page->count_info, page->u.inuse.type_info);
         }
+    }
+
+    if ( is_hvm_domain(d) )
+    {
+        p2m_pod_dump_data(d);
     }

     list_for_each_entry ( page, &d->xenpage_list, list )
diff -r e61e4075a710 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c     Wed Dec 17 12:59:51 2008 +0000
+++ b/xen/arch/x86/mm/p2m.c     Fri Dec 19 17:51:53 2008 +0000
@@ -118,9 +118,16 @@
         return flags;
     case p2m_mmio_direct:
         return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_PCD;
+    case p2m_populate_on_demand:
+        return flags;
     }
 }

+#if P2M_AUDIT
+static void audit_p2m(struct domain *d);
+#else
+# define audit_p2m(_d) do { (void)(_d); } while(0)
+#endif /* P2M_AUDIT */

 // Find the next level's P2M entry, checking for out-of-range gfn's...
 // Returns NULL on error.
@@ -162,7 +169,8 @@
                                       shift, max)) )
         return 0;

-    if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
+    /* PoD: Not present doesn't imply empty. */
+    if ( !l1e_get_flags(*p2m_entry) )
     {
         struct page_info *pg = d->arch.p2m->alloc_page(d);
         if ( pg == NULL )
@@ -197,7 +205,7 @@
         }
     }

-    ASSERT(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT);
+    ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));

     /* split single large page into 4KB page in P2M table */
     if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
@@ -240,6 +248,236 @@
     *table = next;

     return 1;
+}
+
+/*
+ * Populate-on-demand functionality
+ */
+int
+p2m_pod_cache_add(struct domain *d,
+                  struct page_info *page,
+                  unsigned long order)
+{
+    int i;
+    struct page_info *p;
+    struct p2m_domain *p2md = d->arch.p2m;
+
+#ifndef NDEBUG
+    mfn_t mfn;
+
+    mfn = page_to_mfn(page);
+
+    /* Check to make sure this is a contiguous region */
+    if( mfn_x(mfn) & ((1 << order) - 1) )
+    {
+        printk("%s: mfn %lx not aligned order %lu! (mask %lx)\n",
+               __func__, mfn_x(mfn), order, ((1UL << order) - 1));
+        return -1;
+    }
+
+    for(i=0; i < 1 << order ; i++) {
+        struct domain * od;
+
+        p = mfn_to_page(_mfn(mfn_x(mfn) + i));
+        od = page_get_owner(p);
+        if(od != d)
+        {
+            printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
+                   __func__, mfn_x(mfn), d->domain_id,
+                   od?od->domain_id:-1);
+            return -1;
+        }
+    }
+#endif
+
+    spin_lock(&d->page_alloc_lock);
+
+    /* First, take all pages off the domain list */
+    for(i=0; i < 1 << order ; i++)
+    {
+        p = page + i;
+        list_del(&p->list);
+    }
+
+    /* Then add the first one to the appropriate populate-on-demand list */
+    switch(order)
+    {
+    case 9:
+        list_add_tail(&page->list, &p2md->pod.super); /* lock: page_alloc */
+        p2md->pod.count += 1 << order;
+        break;
+    case 0:
+        list_add_tail(&page->list, &p2md->pod.single); /* lock: page_alloc */
+        p2md->pod.count += 1;
+        break;
+    default:
+        BUG();
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+
+    return 0;
+}
+
+void
+p2m_pod_empty_cache(struct domain *d)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    struct list_head *q, *p;
+
+    spin_lock(&d->page_alloc_lock);
+
+    list_for_each_safe(p, q, &p2md->pod.super) /* lock: page_alloc */
+    {
+        int i;
+        struct page_info *page;
+
+        list_del(p);
+
+        page = list_entry(p, struct page_info, list);
+
+        for ( i = 0 ; i < (1 << 9) ; i++ )
+        {
+            BUG_ON(page_get_owner(page + i) != d);
+            list_add_tail(&page[i].list, &d->page_list);
+        }
+
+        p2md->pod.count -= 1<<9;
+    }
+
+    list_for_each_safe(p, q, &p2md->pod.single)
+    {
+        struct page_info *page;
+
+        list_del(p);
+
+        page = list_entry(p, struct page_info, list);
+
+        BUG_ON(page_get_owner(page) != d);
+        list_add_tail(&page->list, &d->page_list);
+
+        p2md->pod.count -= 1;
+    }
+
+    BUG_ON(p2md->pod.count != 0);
+
+    spin_unlock(&d->page_alloc_lock);
+}
+
+void
+p2m_pod_dump_data(struct domain *d)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+
+    printk("    PoD entries=%d cachesize=%d\n",
+           p2md->pod.entry_count, p2md->pod.count);
+}
+
+static int
+p2m_pod_demand_populate(struct domain *d, unsigned long gfn,
+                        mfn_t table_mfn,
+                        l1_pgentry_t *p2m_entry,
+                        unsigned int order,
+                        p2m_query_t q)
+{
+    struct page_info *p = NULL; /* Compiler warnings */
+    unsigned long gfn_aligned;
+    mfn_t mfn;
+    l1_pgentry_t entry_content = l1e_empty();
+    struct p2m_domain *p2md = d->arch.p2m;
+    int i;
+
+    /* We need to grab the p2m lock here and re-check the entry to make
+     * sure that someone else hasn't populated it for us, then hold it
+     * until we're done. */
+    p2m_lock(p2md);
+    audit_p2m(d);
+
+    /* Check to make sure this is still PoD */
+    if ( p2m_flags_to_type(l1e_get_flags(*p2m_entry)) !=
+         p2m_populate_on_demand )
+    {
+        p2m_unlock(p2md);
+        return 0;
+    }
+
+    spin_lock(&d->page_alloc_lock);
+
+    if ( p2md->pod.count == 0 )
+        goto out_of_memory;
+
+    /* FIXME -- use single pages / splinter superpages if need be */
+    switch ( order )
+    {
+    case 9:
+        BUG_ON( list_empty(&p2md->pod.super) );
+        p = list_entry(p2md->pod.super.next, struct page_info, list);
+        p2md->pod.count -= 1 << order; /* Lock: page_alloc */
+        break;
+    case 0:
+        BUG_ON( list_empty(&p2md->pod.single) );
+        p = list_entry(p2md->pod.single.next, struct page_info, list);
+        p2md->pod.count -= 1;
+        break;
+    default:
+        BUG();
+    }
+
+    list_del(&p->list);
+
+    mfn = page_to_mfn(p);
+
+    BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
+
+    /* Put the pages back on the domain page_list */
+    for ( i = 0 ; i < (1 << order) ; i++ )
+    {
+        BUG_ON(page_get_owner(p + i) != d);
+        list_add_tail(&p[i].list, &d->page_list);
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+
+    /* Fill in the entry in the p2m */
+    switch ( order )
+    {
+    case 9:
+    {
+        l2_pgentry_t l2e_content;
+
+        l2e_content = l2e_from_pfn(mfn_x(mfn),
+                                   p2m_type_to_flags(p2m_ram_rw) | _PAGE_PSE);
+
+        entry_content.l1 = l2e_content.l2;
+    }
+    break;
+    case 0:
+        entry_content = l1e_from_pfn(mfn_x(mfn),
+                                     p2m_type_to_flags(p2m_ram_rw));
+        break;
+
+    }
+
+    gfn_aligned = (gfn >> order) << order;
+
+    paging_write_p2m_entry(d, gfn_aligned, p2m_entry, table_mfn,
+                           entry_content, (order==9)?2:1);
+
+    for( i = 0 ; i < (1UL << order) ; i++ )
+        set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i);
+
+    p2md->pod.entry_count -= (1 << order); /* Lock: p2m */
+    BUG_ON(p2md->pod.entry_count < 0);
+    audit_p2m(d);
+    p2m_unlock(p2md);
+
+    return 0;
+out_of_memory:
+    spin_unlock(&d->page_alloc_lock);
+    audit_p2m(d);
+    p2m_unlock(p2md);
+    printk("%s: Out of populate-on-demand memory!\n", __func__);
+    domain_crash(d);
+    return -1;
 }

 // Returns 0 on error (out of memory)
@@ -303,6 +541,7 @@
                                    L2_PAGETABLE_ENTRIES);
         ASSERT(p2m_entry);

+        /* FIXME: Deal with 4k replaced by 2meg pages */
         if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
              !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
         {
@@ -311,7 +550,7 @@
             goto out;
         }

-        if ( mfn_valid(mfn) )
+        if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
                                        p2m_type_to_flags(p2mt) | _PAGE_PSE);
         else
@@ -403,8 +642,21 @@

     l2e = map_domain_page(mfn_x(mfn));
     l2e += l2_table_offset(addr);
+
+pod_retry_l2:
     if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
     {
+        /* PoD: Try to populate a 2-meg chunk */
+        if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
+        {
+            if ( q != p2m_query ) {
+                if( !p2m_pod_demand_populate(d, gfn, mfn,
+                                             (l1_pgentry_t *)l2e, 9, q) )
+                    goto pod_retry_l2;
+            } else
+                *t = p2m_populate_on_demand;
+        }
+
         unmap_domain_page(l2e);
         return _mfn(INVALID_MFN);
     }
@@ -423,8 +675,20 @@

     l1e = map_domain_page(mfn_x(mfn));
     l1e += l1_table_offset(addr);
+pod_retry_l1:
     if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
     {
+        /* PoD: Try to populate */
+        if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand )
+        {
+            if ( q != p2m_query ) {
+                if( !p2m_pod_demand_populate(d, gfn, mfn,
+                                             (l1_pgentry_t *)l1e, 0, q) )
+                    goto pod_retry_l1;
+            } else
+                *t = p2m_populate_on_demand;
+        }
+
         unmap_domain_page(l1e);
         return _mfn(INVALID_MFN);
     }
@@ -450,48 +714,114 @@

     if ( gfn <= current->domain->arch.p2m->max_mapped_pfn )
     {
-        l1_pgentry_t l1e = l1e_empty();
+        l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
         l2_pgentry_t l2e = l2e_empty();
         int ret;

         ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START)
                / sizeof(l1_pgentry_t));

+        /*
+         * Read & process L2
+         */
+        p2m_entry = &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START)
+                                       + l2_linear_offset(addr)];
+
+    pod_retry_l2:
         ret = __copy_from_user(&l2e,
-                               &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) +
-                                                  l2_linear_offset(addr)],
+                               p2m_entry,
                                sizeof(l2e));
+        if ( ret != 0
+             || !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+        {
+            if( (l2e_get_flags(l2e) & _PAGE_PSE)
+                && ( p2m_flags_to_type(l2e_get_flags(l2e))
+                     == p2m_populate_on_demand ) )
+            {
+                /* The read has succeeded, so we know that the mapping
+                 * exists at this point.  */
+                if ( q != p2m_query )
+                {
+                    if( !p2m_pod_demand_populate(current->domain, gfn, mfn,
+                                                 p2m_entry, 9, q) )
+                        goto pod_retry_l2;
+
+                    /* Allocate failed. */
+                    p2mt = p2m_invalid;
+                    printk("%s: Allocate failed!\n", __func__);
+                    goto out;
+                }
+                else
+                {
+                    p2mt = p2m_populate_on_demand;
+                    goto out;
+                }
+            }
+
+            goto pod_retry_l1;
+        }

-        if ( (ret == 0) && (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
-             (l2e_get_flags(l2e) & _PAGE_PSE) )
+        if ( l2e_get_flags(l2e) & _PAGE_PSE )
         {
             p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
             ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
+
             if ( p2m_is_valid(p2mt) )
                 mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
             else
                 p2mt = p2m_mmio_dm;
+
+            goto out;
         }
-        else
-        {
-
-            /* Need to __copy_from_user because the p2m is sparse and this
-             * part might not exist */
-            ret = __copy_from_user(&l1e,
-                                   &phys_to_machine_mapping[gfn],
-                                   sizeof(l1e));
+
+        /*
+         * Read and process L1
+         */
+
+        /* Need to __copy_from_user because the p2m is sparse and this
+         * part might not exist */
+    pod_retry_l1:
+        p2m_entry = &phys_to_machine_mapping[gfn];
+
+        ret = __copy_from_user(&l1e,
+                               p2m_entry,
+                               sizeof(l1e));

-            if ( ret == 0 ) {
-                p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
-                ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
-                if ( p2m_is_valid(p2mt) )
-                    mfn = _mfn(l1e_get_pfn(l1e));
-                else
-                    /* XXX see above */
-                    p2mt = p2m_mmio_dm;
+        if ( ret == 0 ) {
+            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+
+            if ( p2m_flags_to_type(l1e_get_flags(l1e))
+                 == p2m_populate_on_demand )
+            {
+                /* The read has succeeded, so we know that the mapping
+                 * exists at this point.  */
+                if ( q != p2m_query )
+                {
+                    if( !p2m_pod_demand_populate(current->domain, gfn, mfn,
+                                                 (l1_pgentry_t *)p2m_entry, 0,
+                                                 q) )
+                        goto pod_retry_l1;
+
+                    /* Allocate failed. */
+                    p2mt = p2m_invalid;
+                    goto out;
+                }
+                else
+                {
+                    p2mt = p2m_populate_on_demand;
+                    goto out;
+                }
             }
+
+            if ( p2m_is_valid(p2mt) )
+                mfn = _mfn(l1e_get_pfn(l1e));
+            else
+                /* XXX see above */
+                p2mt = p2m_mmio_dm;
         }
     }
-
+out:
     *t = p2mt;
     return mfn;
 }
@@ -510,6 +840,8 @@
     memset(p2m, 0, sizeof(*p2m));
     p2m_lock_init(p2m);
     INIT_LIST_HEAD(&p2m->pages);
+    INIT_LIST_HEAD(&p2m->pod.super);
+    INIT_LIST_HEAD(&p2m->pod.single);

     p2m->set_entry = p2m_set_entry;
     p2m->get_entry = p2m_gfn_to_mfn;
@@ -680,6 +1012,7 @@
     struct page_info *page;
     struct domain *od;
     unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
+    int entry_count = 0;
     mfn_t p2mfn;
     unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
     int test_linear;
@@ -809,6 +1142,10 @@
                 {
                     if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
                     {
+                        if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE)
+                             && ( p2m_flags_to_type(l2e_get_flags(l2e[i2]))
+                                  == p2m_populate_on_demand ) )
+                            entry_count += (1 << 9);
                         gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
                         continue;
                     }
@@ -839,13 +1176,20 @@
                     for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
                     {
                         if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+                        {
+                            if ( p2m_flags_to_type(l1e_get_flags(l1e[i1]))
+                                 == p2m_populate_on_demand )
+                                entry_count++;
                             continue;
+                        }
                         mfn = l1e_get_pfn(l1e[i1]);
                         ASSERT(mfn_valid(_mfn(mfn)));
                         m2pfn = get_gpfn_from_mfn(mfn);
                         if ( m2pfn != gfn )
                         {
                             pmbad++;
+                            printk("mismatch: gfn %#lx -> mfn %#lx"
+                                   " -> gfn %#lx\n", gfn, mfn, m2pfn);
                             P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
                                        " -> gfn %#lx\n", gfn, mfn, m2pfn);
                             BUG();
@@ -868,6 +1212,15 @@

     }

+    if ( entry_count != d->arch.p2m->pod.entry_count )
+    {
+        printk("%s: refcounted entry count %d, audit count %d!\n",
+               __func__,
+               d->arch.p2m->pod.entry_count,
+               entry_count);
+        BUG();
+    }
+
     //P2M_PRINTK("p2m audit complete\n");
     //if ( orphans_i | orphans_d | mpbad | pmbad )
     //    P2M_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
@@ -876,8 +1229,6 @@
         P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
                    pmbad, mpbad);
 }
-#else
-#define audit_p2m(_d) do { (void)(_d); } while(0)
 #endif /* P2M_AUDIT */


@@ -915,6 +1266,77 @@
 }

 int
+guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
+                                      unsigned int order)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    unsigned long i;
+    p2m_type_t ot;
+    mfn_t omfn;
+    int pod_count = 0;
+    int rc = 0;
+
+    BUG_ON(!paging_mode_translate(d));
+
+#if CONFIG_PAGING_LEVELS == 3
+    /*
+     * 32bit PAE nested paging does not support over 4GB guest due to
+     * hardware translation limit. This limitation is checked by comparing
+     * gfn with 0xfffffUL.
+     */
+    if ( paging_mode_hap(d) && (gfn > 0xfffffUL) )
+    {
+        if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
+            dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
+                    " 4GB: specify 'hap=0' domain config option.\n",
+                    d->domain_id);
+        return -EINVAL;
+    }
+#endif
+
+    p2m_lock(p2md);
+    audit_p2m(d);
+
+    P2M_DEBUG("mark pod gfn=%#lx\n", gfn);
+
+    /* Make sure all gpfns are unused */
+    for ( i = 0; i < (1UL << order); i++ )
+    {
+        omfn = gfn_to_mfn_query(d, gfn + i, &ot);
+        if ( p2m_is_ram(ot) )
+        {
+            printk("%s: gfn_to_mfn returned type %d!\n",
+                   __func__, ot);
+            rc = -EBUSY;
+            goto out;
+        }
+        else if ( ot == p2m_populate_on_demand )
+        {
+            /* Count how many PoD entries we'll be replacing if successful */
+            pod_count++;
+        }
+    }
+
+    /* Now, actually do the two-way mapping */
+    if ( !set_p2m_entry(d, gfn, _mfn(POPULATE_ON_DEMAND_MFN), order,
+                        p2m_populate_on_demand) )
+        rc = -EINVAL;
+    else
+    {
+        p2md->pod.entry_count += 1 << order; /* Lock: p2m */
+        p2md->pod.entry_count -= pod_count;
+        BUG_ON(p2md->pod.entry_count < 0);
+    }
+
+out:
+    audit_p2m(d);
+    p2m_unlock(p2md);
+
+    return rc;
+
+}
+
+int
 guest_physmap_add_entry(struct domain *d, unsigned long gfn,
                         unsigned long mfn, unsigned int page_order,
                         p2m_type_t t)
@@ -922,6 +1344,7 @@
     unsigned long i, ogfn;
     p2m_type_t ot;
     mfn_t omfn;
+    int pod_count = 0;
     int rc = 0;

     if ( !paging_mode_translate(d) )
@@ -969,6 +1392,11 @@
             ASSERT(mfn_valid(omfn));
             set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
         }
+        else if ( ot == p2m_populate_on_demand )
+        {
+            /* Count how many PoD entries we'll be replacing if successful */
+            pod_count++;
+        }
     }

     /* Then, look for m->p mappings for this range and deal with them */
@@ -1015,6 +1443,11 @@
         if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order,
                             p2m_invalid) )
             rc = -EINVAL;
+        else
+        {
+            d->arch.p2m->pod.entry_count -= pod_count; /* Lock: p2m */
+            BUG_ON(d->arch.p2m->pod.entry_count < 0);
+        }
     }

     audit_p2m(d);
diff -r e61e4075a710 xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c  Wed Dec 17 12:59:51 2008 +0000
+++ b/xen/arch/x86/mm/paging.c  Fri Dec 19 17:51:53 2008 +0000
@@ -585,6 +585,9 @@

     /* clean up log dirty resources. */
     paging_log_dirty_teardown(d);
+
+    /* Move populate-on-demand cache back to domain_list for destruction */
+    p2m_pod_empty_cache(d);
 }

 /* Call once all of the references to the domain have gone away */
diff -r e61e4075a710 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Wed Dec 17 12:59:51 2008 +0000
+++ b/xen/arch/x86/mm/shadow/multi.c    Fri Dec 19 17:51:53 2008 +0000
@@ -2173,7 +2173,7 @@
         mfn_t gl3mfn = gfn_to_mfn_query(d, gl3gfn, &p2mt);
         if ( p2m_is_ram(p2mt) )
             sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
-        else
+        else if ( p2mt != p2m_populate_on_demand )
             result |= SHADOW_SET_ERROR;

 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
@@ -2230,7 +2230,7 @@
         mfn_t gl2mfn = gfn_to_mfn_query(v->domain, gl2gfn, &p2mt);
         if ( p2m_is_ram(p2mt) )
             sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
-        else
+        else if ( p2mt != p2m_populate_on_demand )
             result |= SHADOW_SET_ERROR;

 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
@@ -2278,8 +2278,8 @@
         {
             mfn_t gl1mfn = gfn_to_mfn_query(v->domain, gl1gfn, &p2mt);
             if ( p2m_is_ram(p2mt) )
-                sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
-            else
+                sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+            else if ( p2mt != p2m_populate_on_demand )
                 result |= SHADOW_SET_ERROR;
         }
     }
diff -r e61e4075a710 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h Wed Dec 17 12:59:51 2008 +0000
+++ b/xen/include/asm-x86/p2m.h Fri Dec 19 17:51:53 2008 +0000
@@ -64,6 +64,7 @@
     p2m_ram_ro = 3,             /* Read-only; writes are silently dropped */
     p2m_mmio_dm = 4,            /* Reads and write go to the device model */
     p2m_mmio_direct = 5,        /* Read/write mapping of genuine MMIO area */
+    p2m_populate_on_demand = 6, /* Place-holder for empty memory */
 } p2m_type_t;

 typedef enum {
@@ -88,11 +89,19 @@
 #define P2M_RO_TYPES (p2m_to_mask(p2m_ram_logdirty)     \
                       | p2m_to_mask(p2m_ram_ro))

+#define P2M_MAGIC_TYPES (p2m_to_mask(p2m_populate_on_demand))
+
 /* Useful predicates */
 #define p2m_is_ram(_t) (p2m_to_mask(_t) & P2M_RAM_TYPES)
 #define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES)
 #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
+#define p2m_is_magic(_t) (p2m_to_mask(_t) & P2M_MAGIC_TYPES)
 #define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES))
+
+/* Populate-on-demand */
+#define POPULATE_ON_DEMAND_MFN  (1<<9)
+#define POD_PAGE_ORDER 9
+

 struct p2m_domain {
     /* Lock that protects updates to the p2m */
@@ -122,6 +131,28 @@

     /* Highest guest frame that's ever been mapped in the p2m */
     unsigned long max_mapped_pfn;
+
+    /* Populate-on-demand variables
+     * NB on locking.  {super,single,count} are
+     * covered by d->page_alloc_lock, since they're almost always used in
+     * conjunction with that functionality.  {entry_count} is covered by
+     * the domain p2m lock, since it's almost always used in conjunction
+     * with changing the p2m tables.
+     *
+     * At this point, both locks are held in two places.  In both,
+     * the order is [p2m,page_alloc]:
+     * + p2m_pod_decrease_reservation() calls p2m_pod_cache_add(),
+     *   which grabs page_alloc
+     * + p2m_pod_demand_populate() grabs both; the p2m lock to avoid
+     *   double-demand-populating of pages, the page_alloc lock to
+     *   protect moving stuff from the PoD cache to the domain page list.
+     */
+    struct {
+        struct list_head super,        /* List of superpages                */
+                         single;       /* Non-super lists                   */
+        int              count,        /* # of pages in cache lists         */
+                         entry_count;  /* # of pages in p2m marked pod      */
+    } pod;
 };

 /* Extract the type from the PTE flags that store it */
@@ -220,10 +251,21 @@
 void p2m_teardown(struct domain *d);
 void p2m_final_teardown(struct domain *d);

+/* Dump PoD information about the domain */
+void p2m_pod_dump_data(struct domain *d);
+
+/* Move all pages from the populate-on-demand cache to the domain page_list
+ * (usually in preparation for domain destruction) */
+void p2m_pod_empty_cache(struct domain *d);
+
 /* Add a page to a domain's p2m table */
 int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
                             unsigned long mfn, unsigned int page_order,
                             p2m_type_t t);
+
+/* Set a p2m range as populate-on-demand */
+int guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
+                                          unsigned int order);

 /* Untyped version for RAM only, for compatibility
  *

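A usage note, not part of the patch: guest_physmap_mark_populate_on_demand()
is the hypervisor-side entry point for installing the placeholders.  However
the hypercall plumbing lands in the rest of the series, a caller marking a
guest's upper range as PoD would presumably loop over it in order-9 chunks,
roughly as below; pod_start and max_gfn are assumed inputs, not names from
this patch.

    /* Hedged sketch: mark gfns [pod_start, max_gfn) populate-on-demand
     * in 2MB chunks, stopping on the first failure. */
    unsigned long gfn;
    int rc = 0;

    for ( gfn = pod_start; gfn < max_gfn && rc == 0;
          gfn += (1UL << POD_PAGE_ORDER) )
        rc = guest_physmap_mark_populate_on_demand(d, gfn, POD_PAGE_ORDER);
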
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
