[Xen-devel] [RFC][PATCH] 7/9 Populate-on-demand memory: Xen interface

Implement Xen interface to PoD functionality.
* Increase the number of MEMOP bits from 4 to 6 (increasing the number
of available memory operations from 16 to 64).
* Introduce XENMEMF_populate_on_demand, which will cause
populate_physmap() to fill a range with PoD entries rather than
backing it with RAM.
* Introduce the XENMEM_[sg]et_pod_target operations to the memory
hypercall, to get and set the PoD cache size.  set_pod_target() should be
called during domain creation, as well as after modifying the memory
target of any domain which may have outstanding PoD entries.

Signed-off-by: George Dunlap <george.dunlap@xxxxxxxxxxxxx>

diff -r 90feb993b0b8 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/arch/x86/mm.c Tue Dec 23 11:35:30 2008 +0000
@@ -3976,6 +3976,49 @@
         return 0;
     }

+    case XENMEM_set_pod_target:
+    case XENMEM_get_pod_target:
+    {
+        xen_pod_target_t target;
+        struct domain *d;
+
+        /* Support DOMID_SELF? */
+        if ( !IS_PRIV(current->domain) )
+            return -EINVAL;
+
+        if ( copy_from_guest(&target, arg, 1) )
+            return -EFAULT;
+
+        rc = rcu_lock_target_domain_by_id(target.domid, &d);
+        if ( rc != 0 )
+            return rc;
+
+        if ( op == XENMEM_set_pod_target )
+        {
+            if ( target.target_pages > d->max_pages )
+            {
+                rc = -EINVAL;
+                goto pod_target_out_unlock;
+            }
+
+            rc = p2m_pod_set_mem_target(d, target.target_pages);
+        }
+
+        target.tot_pages       = d->tot_pages;
+        target.pod_cache_pages = d->arch.p2m->pod.count;
+        target.pod_entries     = d->arch.p2m->pod.entry_count;
+
+        if ( copy_to_guest(arg, &target, 1) )
+        {
+            rc= -EFAULT;
+            goto pod_target_out_unlock;
+        }
+
+    pod_target_out_unlock:
+        rcu_unlock_domain(d);
+        return rc;
+    }
+
     default:
         return subarch_memory_op(op, arg);
     }
diff -r 90feb993b0b8 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c     Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/arch/x86/mm/p2m.c     Tue Dec 23 11:35:30 2008 +0000
@@ -387,6 +387,150 @@
     return p;
 }

+/* Set the size of the cache to pod_target pages, allocating or freeing as
+ * necessary, in order-9 chunks where possible and single pages otherwise.
+ * Returns 0 on success, -ENOMEM if the cache cannot be grown, or -EINVAL if
+ * a cached page cannot be referenced while it is being freed. */
+static int
+p2m_pod_set_cache_target(struct domain *d, unsigned long pod_target)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    int ret = 0;
+
+    /* Increasing the target */
+    while ( pod_target > p2md->pod.count )
+    {
+        struct page_info * page;
+        /* Only use a superpage while at least 1<<9 pages are missing. */
+        int order = ((pod_target - p2md->pod.count) >= (1UL<<9)) ? 9 : 0;
+
+        page = alloc_domheap_pages(d, order, 0);
+        if ( unlikely(page == NULL) )
+        {
+            ret = -ENOMEM; /* Don't report success if we failed to grow. */
+            goto out;
+        }
+
+        p2m_pod_cache_add(d, page, order);
+    }
+
+    /* Decreasing the target.  We hold the p2m lock here, so we don't need
+     * to worry about cache disappearing under our feet. */
+    while ( pod_target < p2md->pod.count )
+    {
+        struct page_info * page;
+        int order, i;
+
+        /* Grab the lock before checking that pod.super is empty, or the last
+         * entries may disappear before we grab the lock. */
+        spin_lock(&d->page_alloc_lock);
+
+        order = ( (p2md->pod.count - pod_target) > (1UL<<9)
+                  && !list_empty(&p2md->pod.super) ) ? 9 : 0;
+
+        page = p2m_pod_cache_get(d, order);
+
+        ASSERT(page != NULL);
+
+        spin_unlock(&d->page_alloc_lock);
+
+        /* Then free them */
+        for ( i = 0 ; i < (1 << order) ; i++ )
+        {
+            /* Copied from common/memory.c:guest_remove_page() */
+            if ( unlikely(!get_page(page+i, d)) )
+            {
+                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n",
+                         d->domain_id);
+                ret = -EINVAL;
+                goto out;
+            }
+
+            if ( test_and_clear_bit(_PGT_pinned,
+                                    &(page+i)->u.inuse.type_info) )
+                put_page_and_type(page+i);
+
+            if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
+                put_page(page+i);
+
+            put_page(page+i);
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Set the PoD cache size so that the memory allocated to d matches target.
+ * Returns 0, or the error from p2m_pod_set_cache_target().  The "right
+ * behavior" here requires some careful thought.  First, some definitions:
+ * + M: static_max
+ * + B: number of pages the balloon driver has ballooned down to.
+ * + P: Number of populated pages.
+ * + T: Old target
+ * + T': New target
+ *
+ * The following equations should hold:
+ *  0 <= P <= T <= B <= M
+ *  d->arch.p2m->pod.entry_count == B - P
+ *  d->tot_pages == P + d->arch.p2m->pod.count
+ *
+ * Now we have the following potential cases to cover:
+ *     B <T': Set the PoD cache size equal to the number of outstanding PoD
+ *   entries.  The balloon driver will deflate the balloon to give back
+ *   the remainder of the ram to the guest OS.
+ *  T <T'<B : Increase PoD cache size.
+ *  T'<T<=B : Here we have a choice.  We can decrease the size of the cache,
+ *   get the memory right away.  However, that means every time we
+ *   reduce the memory target we risk the guest attempting to populate the
+ *   memory before the balloon driver has reached its new target.  Safer to
+ *   never reduce the cache size here, but only when the balloon driver frees
+ *   PoD ranges.
+ *
+ * If there are many zero pages, we could reach the target also by doing
+ * zero sweeps and marking the ranges PoD; but the balloon driver will have
+ * to free this memory eventually anyway, so we don't gain much by doing so.
+ *
+ * NB that the equation (B<T') may require adjustment to the cache
+ * size as PoD pages are freed as well; i.e., freeing a PoD-backed
+ * entry when pod.entry_count == pod.count requires us to reduce both
+ * pod.entry_count and pod.count.
+ */
+int
+p2m_pod_set_mem_target(struct domain *d, unsigned long target)
+{
+    unsigned long pod_target, populated; /* full width: never truncated */
+    struct p2m_domain *p2md = d->arch.p2m;
+    int ret = 0;
+
+    /* NOTE(review): no p2m lock is taken here; confirm callers hold it. */
+    /* P == B: Nothing to do. */
+    if ( p2md->pod.entry_count == 0 )
+        goto out;
+
+    /* T' < B: Don't reduce the cache size; let the balloon driver
+     * take care of it. */
+    if ( target < d->tot_pages )
+        goto out;
+
+    populated  = d->tot_pages - p2md->pod.count;
+    pod_target = target - populated;
+
+    /* B < T': Set the cache size equal to # of outstanding entries,
+     * let the balloon driver fill in the rest. */
+    if ( pod_target > p2md->pod.entry_count )
+        pod_target = p2md->pod.entry_count;
+
+    /* Equal when target == d->tot_pages; then the cache is left as it is. */
+    ASSERT( pod_target >= p2md->pod.count );
+
+    ret = p2m_pod_set_cache_target(d, pod_target);
+
+out:
+    return ret;
+}
+
 void
 p2m_pod_empty_cache(struct domain *d)
 {
@@ -537,6 +681,13 @@
             ram--;
         }
     }
+
+    /* If we've reduced our "liabilities" beyond our "assets", free some */
+    if ( p2md->pod.entry_count < p2md->pod.count )
+    {
+        printk("b %d\n", p2md->pod.entry_count);
+        p2m_pod_set_cache_target(d, p2md->pod.entry_count);
+    }

     /* If there are no more non-PoD entries, tell decrease_reservation() that
      * there's nothing left to do. */
@@ -786,7 +937,7 @@
         /* Stop if we're past our limit and we have found *something*.
          *
          * NB that this is a zero-sum game; we're increasing our cache size
-         * by re-increasing our 'debt'.  Since we hold the p2m lock,
+         * by increasing our 'debt'.  Since we hold the p2m lock,
          * (entry_count - count) must remain the same. */
         if ( !list_empty(&p2md->pod.super) &&  i < limit )
             break;
diff -r 90feb993b0b8 xen/arch/x86/x86_64/compat/mm.c
--- a/xen/arch/x86/x86_64/compat/mm.c   Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/arch/x86/x86_64/compat/mm.c   Tue Dec 23 11:35:30 2008 +0000
@@ -122,6 +122,29 @@
 #define XLAT_memory_map_HNDL_buffer(_d_, _s_) ((void)0)
         XLAT_memory_map(&cmp, nat);
 #undef XLAT_memory_map_HNDL_buffer
+        if ( copy_to_guest(arg, &cmp, 1) )
+            rc = -EFAULT;
+
+        break;
+    }
+
+    case XENMEM_set_pod_target:
+    case XENMEM_get_pod_target:
+    {
+        struct compat_pod_target cmp;
+        struct xen_pod_target *nat = (void *)COMPAT_ARG_XLAT_VIRT_BASE;
+
+        if ( copy_from_guest(&cmp, arg, 1) )
+            return -EFAULT;
+
+        XLAT_pod_target(nat, &cmp);
+
+        rc = arch_memory_op(op, guest_handle_from_ptr(nat, void));
+        if ( rc < 0 )
+            break;
+
+        XLAT_pod_target(&cmp, nat);
+
         if ( copy_to_guest(arg, &cmp, 1) )
             rc = -EFAULT;

diff -r 90feb993b0b8 xen/common/memory.c
--- a/xen/common/memory.c       Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/common/memory.c       Tue Dec 23 11:35:30 2008 +0000
@@ -111,31 +111,40 @@
         if ( unlikely(__copy_from_guest_offset(&gpfn, a->extent_list, i, 1)) )
             goto out;

-        page = alloc_domheap_pages(d, a->extent_order, a->memflags);
-        if ( unlikely(page == NULL) )
+        if ( a->memflags & MEMF_populate_on_demand )
         {
-            gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
-                     "id=%d memflags=%x (%ld of %d)\n",
-                     a->extent_order, d->domain_id, a->memflags,
-                     i, a->nr_extents);
-            goto out;
+            if ( guest_physmap_mark_populate_on_demand(d, gpfn,
+                                                       a->extent_order) < 0 )
+                goto out;
         }
+        else
+        {
+            page = alloc_domheap_pages(d, a->extent_order, a->memflags);
+            if ( unlikely(page == NULL) )
+            {
+                gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
+                         "id=%d memflags=%x (%ld of %d)\n",
+                         a->extent_order, d->domain_id, a->memflags,
+                         i, a->nr_extents);
+                goto out;
+            }

-        mfn = page_to_mfn(page);
-        guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
+            mfn = page_to_mfn(page);
+            guest_physmap_add_page(d, gpfn, mfn, a->extent_order);

-        if ( !paging_mode_translate(d) )
-        {
-            for ( j = 0; j < (1 << a->extent_order); j++ )
-                set_gpfn_from_mfn(mfn + j, gpfn + j);
+            if ( !paging_mode_translate(d) )
+            {
+                for ( j = 0; j < (1 << a->extent_order); j++ )
+                    set_gpfn_from_mfn(mfn + j, gpfn + j);

-            /* Inform the domain of the new page's machine address. */
-            if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
-                goto out;
+                /* Inform the domain of the new page's machine address. */
+                if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
+                    goto out;
+            }
         }
     }

- out:
+out:
     a->nr_done = i;
 }

@@ -527,6 +536,10 @@

         args.memflags |= MEMF_node(XENMEMF_get_node(reservation.mem_flags));

+        if ( op == XENMEM_populate_physmap
+             && (reservation.mem_flags & XENMEMF_populate_on_demand) )
+            args.memflags |= MEMF_populate_on_demand;
+
         if ( likely(reservation.domid == DOMID_SELF) )
         {
             d = rcu_lock_current_domain();
diff -r 90feb993b0b8 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/include/asm-x86/p2m.h Tue Dec 23 11:35:30 2008 +0000
@@ -261,6 +261,10 @@
  * (usually in preparation for domain destruction) */
 void p2m_pod_empty_cache(struct domain *d);

+/* Set populate-on-demand cache size so that the total memory allocated to a
+ * domain matches target */
+int p2m_pod_set_mem_target(struct domain *d, unsigned long target);
+
 /* Call when decreasing memory reservation to handle PoD entries properly.
  * Will return '1' if all entries were handled and nothing more need be done.*/
 int
diff -r 90feb993b0b8 xen/include/public/memory.h
--- a/xen/include/public/memory.h       Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/include/public/memory.h       Tue Dec 23 11:35:30 2008 +0000
@@ -48,6 +48,8 @@
 /* NUMA node to allocate from. */
 #define XENMEMF_node(x)     (((x) + 1) << 8)
 #define XENMEMF_get_node(x) ((((x) >> 8) - 1) & 0xffu)
+/* Flag to populate physmap with populate-on-demand entries */
+#define XENMEMF_populate_on_demand (1<<16)
 #endif

 struct xen_memory_reservation {
@@ -299,6 +301,19 @@
 typedef struct xen_foreign_memory_map xen_foreign_memory_map_t;
 DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t);

+#define XENMEM_set_pod_target       16 /* set the target, then report stats */
+#define XENMEM_get_pod_target       17 /* report stats only */
+struct xen_pod_target {
+    /* IN */
+    uint64_t target_pages;    /* new memory target; used by the set op only */
+    /* OUT */
+    uint64_t tot_pages;       /* copied from the domain's tot_pages */
+    uint64_t pod_cache_pages; /* PoD cache size (p2m pod.count) */
+    uint64_t pod_entries;     /* outstanding PoD entries (p2m pod.entry_count) */
+    /* IN */
+    domid_t domid;            /* domain the operation applies to */
+};
+typedef struct xen_pod_target xen_pod_target_t;
 #endif /* __XEN_PUBLIC_MEMORY_H__ */

 /*
diff -r 90feb993b0b8 xen/include/xen/hypercall.h
--- a/xen/include/xen/hypercall.h       Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/include/xen/hypercall.h       Tue Dec 23 11:35:30 2008 +0000
@@ -48,7 +48,7 @@
  * at what point in the page list to resume. For this purpose I steal the
  * high-order bits of the @cmd parameter, which are otherwise unused and zero.
  */
-#define MEMOP_EXTENT_SHIFT 4 /* cmd[:4] == start_extent */
+#define MEMOP_EXTENT_SHIFT 6 /* cmd[:6] == start_extent */
 #define MEMOP_CMD_MASK     ((1 << MEMOP_EXTENT_SHIFT) - 1)

 extern long
diff -r 90feb993b0b8 xen/include/xen/mm.h
--- a/xen/include/xen/mm.h      Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/include/xen/mm.h      Tue Dec 23 11:35:30 2008 +0000
@@ -72,6 +72,8 @@
 /* memflags: */
 #define _MEMF_no_refcount 0
 #define  MEMF_no_refcount (1U<<_MEMF_no_refcount)
+#define _MEMF_populate_on_demand 1
+#define  MEMF_populate_on_demand (1U<<_MEMF_populate_on_demand)
 #define _MEMF_node        8
 #define  MEMF_node(n)     ((((n)+1)&0xff)<<_MEMF_node)
 #define _MEMF_bits        24
diff -r 90feb993b0b8 xen/include/xlat.lst
--- a/xen/include/xlat.lst      Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/include/xlat.lst      Tue Dec 23 11:35:30 2008 +0000
@@ -38,6 +38,7 @@
 !      memory_exchange                 memory.h
 !      memory_map                      memory.h
 !      memory_reservation              memory.h
+!      pod_target                      memory.h
 !      translate_gpfn_list             memory.h
 !      sched_poll                      sched.h
 ?      sched_remote_shutdown           sched.h

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
WARNING - OLD ARCHIVES

xen-devel

[Xen-devel] [RFC][PATCH] 7/9 Populate-on-demand memory: Xen interface