vNUMA : Implement allocation strategies

diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile
--- a/tools/libxc/Makefile
+++ b/tools/libxc/Makefile
@@ -28,6 +28,7 @@ CTRL_SRCS-y += xc_mem_event.c
 CTRL_SRCS-y += xc_mem_paging.c
 CTRL_SRCS-y += xc_memshr.c
 CTRL_SRCS-y += xc_cpumap.c
+CTRL_SRCS-y += xc_dom_numa.c
 CTRL_SRCS-y += xtl_core.c
 CTRL_SRCS-y += xtl_logger_stdio.c
 CTRL_SRCS-$(CONFIG_X86) += xc_pagetab.c
diff --git a/tools/libxc/xc_dom_numa.c b/tools/libxc/xc_dom_numa.c
new file mode 100644
--- /dev/null
+++ b/tools/libxc/xc_dom_numa.c
@@ -0,0 +1,901 @@
+/* XEN Guest NUMA support
+ * Author : Dulloor (dulloor@xxxxxxxxxx) */
+
+#include
+#include
+#include "xg_private.h"
+#include "xc_dom_numa.h"
+#include "xc_cpumap.h"
+
+#ifdef __DOM_NUMA_DEBUG__
+#undef DBGPRINTF
+#define DBGPRINTF(_f, _a...) \
+    xc_report(xch, xch->error_handler, XTL_INFO, 0, _f , ## _a)
+#endif
+
+#define XC_MAX_NODES 16
+struct xc_node_data {
+    uint32_t node_id;
+    uint64_t size_pages;
+    uint64_t free_pages;
+    xc_cpumask_t cpu_mask; /* node_to_cpumask */
+};
+typedef struct xc_node_data xc_node_data_t;
+
+struct xc_machine_numa_layout {
+    uint64_t size_pages;
+    uint64_t free_pages;
+
+    uint32_t nr_nodes;
+
+    /* Only (nr_nodes*nr_nodes) entries are filled */
+    uint32_t node_distance[XC_MAX_NODES*XC_MAX_NODES];
+    /* Only (nr_nodes) entries are filled */
+    xc_node_data_t node_data[XC_MAX_NODES];
+};
+typedef struct xc_machine_numa_layout xc_machine_numa_layout_t;
+
+/* XXX: Move all sanity checks to this function */
+#define XC_DOM_NUMA_MIN_STRIPE 256
+xc_domain_numa_layout_t * xc_dom_alloc_numa_layout(xc_interface *xch,
+    uint32_t domid, uint64_t nr_pages, xc_domain_numa_config_t *config)
+{
+    xc_domain_numa_layout_t *dom_layout;
+
+    if (config->strategy == XC_DOM_NUMA_NONE)
+    {
+        IPRINTF("%s: NUMA memory allocation disabled\n", __FUNCTION__);
+        return 0;
+    }
+    if (!(dom_layout = (xc_domain_numa_layout_t *)malloc(sizeof(*dom_layout))))
+    {
+        ERROR("%s: dom_layout allocation failed\n", __FUNCTION__);
+        return dom_layout;
+    }
+
+    DBGPRINTF("%s: dom_layout allocated\n", __FUNCTION__);
+    memset(dom_layout, 0, sizeof(*dom_layout));
+
+    dom_layout->version = XEN_DOM_NUMA_INTERFACE_VERSION;
+    dom_layout->nr_pages = nr_pages;
+    dom_layout->nr_vnodes = config->nr_nodes;
+
+    /* Internal data */
+    dom_layout->domid = domid;
+    dom_layout->strategy = config->strategy;
+    dom_layout->stripe_size = config->stripe_size;
+    if (dom_layout->stripe_size &&
+        (dom_layout->stripe_size < XC_DOM_NUMA_MIN_STRIPE))
+    {
+        dom_layout->stripe_size = XC_DOM_NUMA_MIN_STRIPE;
+        IPRINTF("%s: Min STRIPE size is %d pages\n",
+            __FUNCTION__, dom_layout->stripe_size);
+    }
+    return dom_layout;
+}
+
+void
+xc_dom_free_numa_layout(xc_interface *xch, xc_domain_numa_layout_t *dom_layout)
+{
+    DBGPRINTF("%s: dom_layout freed\n", __FUNCTION__);
+    free(dom_layout);
+}
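[Not part of the patch: a minimal sketch of how a toolstack caller might drive this API end to end. The wrapper name and the config values are hypothetical; the entry points and the config fields (strategy, nr_nodes, stripe_size) are the ones this series defines.]

    /* Hypothetical caller, e.g. from a domain-builder path. */
    static int build_numa_placement(xc_interface *xch, uint32_t domid,
                                    uint64_t nr_pages)
    {
        xc_domain_numa_config_t config = { 0 };
        xc_domain_numa_layout_t *layout;
        int rc;

        config.strategy = XC_DOM_NUMA_AUTO; /* or CONFINE/SPLIT/STRIPE/NONE */
        config.nr_nodes = 2;                /* requested vnodes */
        config.stripe_size = 0;             /* 0 = default stripe size */

        /* Returns NULL both for strategy NONE and on allocation failure. */
        if (!(layout = xc_dom_alloc_numa_layout(xch, domid, nr_pages, &config)))
            return 0;

        rc = xc_setup_numa_domain(xch, layout);
        if (!rc)
            rc = xc_domain_numa_pinvcpus(xch, layout);

        xc_dom_free_numa_layout(xch, layout);
        return rc;
    }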
+#define XC_DUMP_STR_SZ (8192)
+static void
+xc_dump_dom_numa_layout(xc_interface *xch, xc_domain_numa_layout_t *layout)
+{
+    unsigned int i, j;
+    char *xc_dump_str, *dumpstr;
+    if (!(xc_dump_str = malloc(XC_DUMP_STR_SZ)))
+    {
+        DBGPRINTF("%s : dump_str allocation failed", __FUNCTION__);
+        return;
+    }
+    dumpstr = xc_dump_str;
+    dumpstr += sprintf(dumpstr,
+        "NUMA-LAYOUT(Dom %d) : vcpus(%u), vnodes(%u)",
+        layout->domid, layout->nr_vcpus, layout->nr_vnodes);
+    switch (layout->type)
+    {
+        case XEN_DOM_NUMA_CONFINE:
+            dumpstr += sprintf(dumpstr, ", type(CONFINE)\n");
+            break;
+        case XEN_DOM_NUMA_SPLIT:
+            dumpstr += sprintf(dumpstr, ", type(SPLIT)\n");
+            break;
+        case XEN_DOM_NUMA_STRIPE:
+            dumpstr += sprintf(dumpstr, ", type(STRIPE)\n");
+            break;
+        case XEN_DOM_NUMA_DONTCARE:
+            dumpstr += sprintf(dumpstr, ", type(DONTCARE)\n");
+            break;
+        default:
+            dumpstr += sprintf(dumpstr, ", type(UNDEFINED)\n");
+    }
+    for (i = 0; i < layout->nr_vnodes; i++)
+    {
+        xc_vnode_data_t *vnode_data = &layout->vnode_data[i];
+        dumpstr += sprintf(dumpstr, "vnode[%u]:mnode(%u), node_nr_pages(%x)",
+            vnode_data->vnode_id, vnode_data->mnode_id,
+            vnode_data->nr_pages);
+        if (layout->type == XEN_DOM_NUMA_SPLIT)
+        {
+            char mapstr[128] = "";
+            struct xenctl_cpumap cpumap;
+            xc_cpumap_from_cpumask(&cpumap, &vnode_data->vcpu_mask);
+            xc_cpumap_snprintf(mapstr, sizeof(mapstr), cpumap);
+            dumpstr += sprintf(dumpstr, ", vcpu_mask(%s)", mapstr);
+        }
+        dumpstr += sprintf(dumpstr, "\n");
+    }
+
+    if (layout->type == XEN_DOM_NUMA_CONFINE)
+        goto done;
+    dumpstr += sprintf(dumpstr, "vnode distances :\n");
+    for (i = 0; i < layout->nr_vnodes; i++)
+        dumpstr += sprintf(dumpstr, "\tvnode[%u]", i);
+    for (i = 0; i < layout->nr_vnodes; i++)
+    {
+        dumpstr += sprintf(dumpstr, "\nvnode[%u]", i);
+        for (j = 0; j < layout->nr_vnodes; j++)
+            dumpstr += sprintf(dumpstr, "\t%u",
+                layout->vnode_distance[i*layout->nr_vnodes + j]);
+        dumpstr += sprintf(dumpstr, "\n");
+    }
+done:
+    IPRINTF("%s", xc_dump_str);
+    free(xc_dump_str);
+    return;
+}
+
+static int
+xc_get_machine_numa_layout(xc_interface *xch, xc_machine_numa_layout_t *layout)
+{
+    uint32_t i, nr_nodes, nr_cpus;
+    xc_numainfo_t ninfo = { 0 };
+    uint64_t node_memsize[XC_MAX_NODES];
+    uint64_t node_memfree[XC_MAX_NODES];
+    xc_topologyinfo_t tinfo = { 0 };
+    uint32_t cpu_to_node[XC_CPUMASK_NR_CPUS];
+
+    memset(layout, 0, sizeof(*layout));
+    memset(node_memsize, 0, sizeof(uint64_t)*XC_MAX_NODES);
+    memset(node_memfree, 0, sizeof(uint64_t)*XC_MAX_NODES);
+
+    set_xen_guest_handle(ninfo.node_to_memsize, node_memsize);
+    set_xen_guest_handle(ninfo.node_to_memfree, node_memfree);
+    /* Read directly into layout's structure */
+    set_xen_guest_handle(ninfo.node_to_node_distance, layout->node_distance);
+    ninfo.max_node_index = XC_MAX_NODES-1;
+    if (xc_numainfo(xch, &ninfo))
+    {
+        ERROR("%s: xc_numainfo failed", __FUNCTION__);
+        return -1;
+    }
+    /* No need to check if a node is invalid, as in that case
+     * the size would be zero and it would never get selected */
+    nr_nodes = ninfo.max_node_index + 1;
+    if ( nr_nodes > XC_MAX_NODES )
+        nr_nodes = XC_MAX_NODES;
+
+    set_xen_guest_handle(tinfo.cpu_to_core, NULL);
+    set_xen_guest_handle(tinfo.cpu_to_socket, NULL);
+    set_xen_guest_handle(tinfo.cpu_to_node, cpu_to_node);
+    tinfo.max_cpu_index = XC_CPUMASK_NR_CPUS-1;
+
+    if (xc_topologyinfo(xch, &tinfo))
+    {
+        ERROR("%s: xc_topologyinfo failed", __FUNCTION__);
+        return -1;
+    }
+
+    nr_cpus = tinfo.max_cpu_index+1;
+    if (nr_cpus > XC_CPUMASK_NR_CPUS)
+        nr_cpus = XC_CPUMASK_NR_CPUS;
+
+    layout->nr_nodes = nr_nodes;
+    for (i=0; i<nr_nodes; i++)
+    {
+        uint64_t size_pages, free_pages;
+        layout->node_data[i].node_id = i;
+        size_pages = (node_memsize[i] >> PAGE_SHIFT);
+        free_pages = (node_memfree[i] >> PAGE_SHIFT);
+        layout->node_data[i].size_pages = size_pages;
+        layout->node_data[i].free_pages = free_pages;
+        layout->size_pages += size_pages;
+        layout->free_pages += free_pages;
+    }
+
+    for (i=0; i<nr_cpus; i++)
+    {
+        struct xenctl_cpumap cpumap;
+        xc_cpumask_t *cpumask = &(layout->node_data[(cpu_to_node[i])].cpu_mask);
+        xc_cpumap_from_cpumask(&cpumap, cpumask);
+        xc_cpumap_set_cpu(i, cpumap);
+    }
+    return 0;
+}
+static int
+xc_get_max_vcpus(xc_interface *xch, uint32_t domid)
+{
+    DECLARE_DOMCTL;
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)domid;
+    return ((do_domctl(xch, &domctl) < 0)
+            ? 0 : (domctl.u.getdomaininfo.max_vcpu_id+1));
+}
+
+/* The function makes a (greedy) best-fit selection of num_vnodes nodes,
+ * taking vnode_pages from each. The number of pages selected from each
+ * node is returned in the nodes_pages array.
+ * The best-fit ranking is based on the fraction (in 1024ths) of node
+ * memory that would be occupied if the node were selected.
+ * Returns 0 on success and -1 if selection fails. */
+/* XXX: Node selection needs more research/experience. */
+static int xc_select_best_fit_nodes(
+    xc_interface *xch, xc_machine_numa_layout_t *phys_layout,
+    uint32_t num_vnodes, uint64_t vnode_pages, uint64_t *nodes_pages)
+{
+    int i, num_nodes_selected;
+    uint64_t best_fit_rank;
+
+    DBGPRINTF("%s: called\n", __FUNCTION__);
+#define INVALID_NODE (~0)
+#define NODE_FIT_RANK_SHIFT (10)
+    num_nodes_selected = 0;
+
+    do {
+        int selected_node = INVALID_NODE;
+        best_fit_rank = 0;
+        for (i=0; i<phys_layout->nr_nodes; i++)
+        {
+            xc_node_data_t *node_data;
+            uint64_t node_sizepages, node_freepages;
+            uint64_t node_fit_rank;
+
+            /* Node is already selected */
+            if (nodes_pages[i])
+                continue;
+
+            node_data = &phys_layout->node_data[i];
+            node_sizepages = node_data->size_pages;
+            node_freepages = node_data->free_pages;
+
+            if (node_freepages < vnode_pages)
+                continue;
+
+            /* Fraction of the node occupied after taking vnode_pages */
+            node_fit_rank = ((node_sizepages-node_freepages+vnode_pages)
+                                << NODE_FIT_RANK_SHIFT) / node_sizepages;
+
+            if (node_fit_rank > best_fit_rank)
+            {
+                best_fit_rank = node_fit_rank;
+                selected_node = i;
+            }
+        }
+
+        /* Nodes could not be selected. Bail out ! */
+        if (selected_node == INVALID_NODE)
+            return -1;
+
+        nodes_pages[selected_node] = vnode_pages;
+        num_nodes_selected++;
+    } while(num_nodes_selected < num_vnodes);
+#undef NODE_FIT_RANK_SHIFT
+#undef INVALID_NODE
+    return 0;
+}
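[To make the ranking concrete, a worked example with illustrative numbers, not taken from the patch. Two half-free nodes compete for a 1GB vnode; the smaller node ends up proportionally fuller, so it ranks higher and is picked, which packs small domains tightly:]

    /* rank = ((size - free + vnode) << 10) / size, in 1024ths of the node */
    /* node A: size 0x100000 pages (4GB), free 0x80000 (2GB)               */
    /*   rank_A = ((0x100000 - 0x80000 + 0x40000) << 10) / 0x100000 = 768  */
    /* node B: size 0x200000 pages (8GB), free 0x100000 (4GB)              */
    /*   rank_B = ((0x200000 - 0x100000 + 0x40000) << 10) / 0x200000 = 640 */
    /* For a vnode of 0x40000 pages (1GB), node A wins (768 > 640).        */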
+/* Sort the phys nodes in increasing order of free node memory */
+static void xc_sort_nodeload(xc_machine_numa_layout_t *phys_layout)
+{
+    int i, j;
+    uint32_t nr_nodes;
+
+    nr_nodes = phys_layout->nr_nodes;
+
+    for (i = 0; i < nr_nodes; i++)
+    {
+        uint64_t i_node_free = phys_layout->node_data[i].free_pages;
+        for (j = i+1; j < nr_nodes; j++)
+        {
+            uint64_t j_node_free = phys_layout->node_data[j].free_pages;
+            if (i_node_free > j_node_free)
+            {
+                xc_node_data_t tmp_node_data;
+                tmp_node_data = phys_layout->node_data[i];
+                phys_layout->node_data[i] = phys_layout->node_data[j];
+                phys_layout->node_data[j] = tmp_node_data;
+                /* slot i now holds the smaller node */
+                i_node_free = j_node_free;
+            }
+        }
+    }
+
+    return;
+}
+
+/* The function selects the nodes in increasing order of free node memory,
+ * and fills them up. The physical memory map for such a domain is striped
+ * across all the selected nodes.
+ * The phys_layout node_data structures may be sorted in place, so
+ * node_data->node_id (not the array index) must be used when indexing the
+ * node_distance array.
+ * Returns the number of nodes selected, or -1 on failure. */
+static int xc_select_max_fit_nodes(
+    xc_interface *xch, xc_machine_numa_layout_t *phys_layout,
+    uint64_t dom_pages, uint64_t *node_pages)
+{
+    int i, num_nodes_selected = 0;
+    uint64_t dom_alloc_pages;
+
+    DBGPRINTF("%s: called\n", __FUNCTION__);
+    xc_sort_nodeload(phys_layout);
+
+    dom_alloc_pages = 0;
+    for (i=0; i<phys_layout->nr_nodes; i++)
+    {
+        xc_node_data_t *node_data;
+        uint64_t node_freepages;
+
+        node_data = &phys_layout->node_data[i];
+
+        /* In max-fit, if we try to pack the nodes too aggressively
+         * we might fail on any small allocation (from xen node heaps).
+         * That's why, with DEFAULT, we don't use the exact_node flag. */
+        node_freepages = node_data->free_pages;
+        if (!node_freepages)
+            continue;
+
+        if (node_freepages > (dom_pages-dom_alloc_pages))
+            node_freepages = (dom_pages-dom_alloc_pages);
+
+        node_pages[i] = node_freepages;
+        dom_alloc_pages += node_freepages;
+        num_nodes_selected++;
+        if (dom_alloc_pages == dom_pages)
+            break;
+    }
+    if (dom_alloc_pages != dom_pages)
+    {
+        ERROR("%s: Failed to allocate memory (need to balloon more?)\n",
+            __FUNCTION__);
+        return -1;
+    }
+    return num_nodes_selected;
+}
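[The max-fit path, by contrast, greedily drains nodes from least to most free memory. A small sketch with illustrative numbers:]

    /* dom_pages = 0xA0000; nodes after xc_sort_nodeload (by free pages): */
    /*   node A: free 0x10000 -> contributes 0x10000 (drained)            */
    /*   node B: free 0x40000 -> contributes 0x40000 (drained)            */
    /*   node C: free 0x80000 -> contributes 0x50000 (the remainder)      */
    /* Three nodes selected; the domain's memory is striped across them.  */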
+static int xc_setup_vnode_vcpu_masks(xc_domain_numa_layout_t *dom_layout)
+{
+    int vcpu;
+    for (vcpu=0; vcpu<dom_layout->nr_vcpus; vcpu++)
+    {
+        struct xenctl_cpumap vcpumap;
+        xc_cpumask_t *vcpumask;
+        int vnode = vcpu/(dom_layout->nr_vcpus/dom_layout->nr_vnodes);
+
+        vcpumask = &dom_layout->vnode_data[vnode].vcpu_mask;
+        xc_cpumap_from_cpumask(&vcpumap, vcpumask);
+        xc_cpumap_set_cpu(vcpu, vcpumap);
+    }
+    return 0;
+}
+
+static int xc_setup_vnode_distances(xc_machine_numa_layout_t *phys_layout,
+    xc_domain_numa_layout_t *dom_layout)
+{
+    int vn1, vn2;
+    for (vn1=0; vn1<dom_layout->nr_vnodes; vn1++)
+    {
+        int n1 = dom_layout->vnode_data[vn1].mnode_id;
+        for (vn2=0; vn2<dom_layout->nr_vnodes; vn2++)
+        {
+            int n2 = dom_layout->vnode_data[vn2].mnode_id;
+            dom_layout->vnode_distance[(vn1*dom_layout->nr_vnodes)+vn2] =
+                phys_layout->node_distance[(n1*phys_layout->nr_nodes)+n2];
+        }
+    }
+    return 0;
+}
+
+/* We require the vnodes to be aligned to 1GB.
+ * SHIFT values are for 4K pages. */
+#define XC_VNODE_MIN_SHIFT (XEN_MIN_VNODE_SHIFT-PAGE_SHIFT)
+#define XC_VNODE_MIN_SIZE (1UL << XC_VNODE_MIN_SHIFT)
+#define XC_VNODE_MIN_MASK ~(XC_VNODE_MIN_SIZE-1)
+/* Because we are strict with the alignment, we boost the size
+ * to account for the pages not seen in the physmap (by 16MB for now). */
+#define XC_VNODE_BOOST_SIZE (4096)
+#define XC_VCPUS_PER_VNODE (1)
+#define XC_POWER_OF_2(x) (((x) & ((x) - 1)) == 0)
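[Spelling the constants out, assuming PAGE_SHIFT is 12 (4K pages) and XEN_MIN_VNODE_SHIFT is 30, i.e. the 1GB stated in the comment; both values live in headers outside this patch, so they are assumptions here:]

    /* XC_VNODE_MIN_SHIFT  = 30 - 12 = 18                              */
    /* XC_VNODE_MIN_SIZE   = 1UL << 18 = 0x40000 pages = 1GB           */
    /* XC_VNODE_MIN_MASK   = ~0x3ffff, rounds a page count down to a   */
    /*                       1GB multiple                              */
    /* XC_VNODE_BOOST_SIZE = 4096 pages * 4KB = 16MB                   */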
+static int xc_setup_domain_vnodes(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout,
+    uint64_t *node_pages_selected)
+{
+    int i;
+    uint32_t vnode_id;
+
+    for (i=0, vnode_id=0; i<phys_layout->nr_nodes; i++)
+    {
+        xc_node_data_t *node_data;
+        xc_vnode_data_t *vnode_data;
+
+        if (!node_pages_selected[i])
+            continue;
+
+        node_data = &phys_layout->node_data[i];
+        vnode_data = &dom_layout->vnode_data[vnode_id];
+        vnode_data->vnode_id = vnode_id;
+        vnode_data->nr_pages = node_pages_selected[i];
+        vnode_data->mnode_id = node_data->node_id;
+        vnode_id++;
+    }
+    if (vnode_id != dom_layout->nr_vnodes)
+    {
+        ERROR("%s: Internal Error(vnode count mismatch) (%d/%d) !\n",
+            __FUNCTION__, vnode_id, dom_layout->nr_vnodes);
+        return -1;
+    }
+    /* vnodes are exposed to the guest only for SPLIT. */
+    if (xc_setup_vnode_vcpu_masks(dom_layout) ||
+        (xc_setup_vnode_distances(phys_layout, dom_layout)))
+    {
+        ERROR("%s: vnode setup failed !\n", __FUNCTION__);
+        return -1;
+    }
+
+    return 0;
+}
+
+static int xc_select_domain_prep(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    if (!dom_layout->nr_vnodes)
+    {
+        ERROR("%s: VM nr_vnodes configured incorrectly !\n", __FUNCTION__);
+        return -1;
+    }
+
+    if (dom_layout->nr_pages > phys_layout->free_pages)
+    {
+        ERROR(
+            "%s: Not enough memory for pv (unlikely after balloon checks)\n",
+            __FUNCTION__);
+        return -1;
+    }
+
+    if (!(dom_layout->nr_vcpus = xc_get_max_vcpus(xch, dom_layout->domid)))
+    {
+        ERROR("%s: xc_get_max_vcpus failed !\n", __FUNCTION__);
+        return -1;
+    }
+
+    if (dom_layout->nr_vcpus > XC_CPUMASK_NR_CPUS)
+    {
+        ERROR("%s: Failed - More than %d vcpus!\n",
+            __FUNCTION__, XC_CPUMASK_NR_CPUS);
+        return -1;
+    }
+
+    if (dom_layout->nr_vcpus < dom_layout->nr_vnodes)
+    {
+        ERROR("%s: VM (%d) - fewer vcpus(%d) than vnodes(%d)!\n",
+            __FUNCTION__, dom_layout->domid, dom_layout->nr_vcpus,
+            dom_layout->nr_vnodes);
+        return -1;
+    }
+
+    return 0;
+}
+
+static int xc_select_domain_confine(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    uint64_t *node_pages_selected = 0;
+    int rc;
+
+    DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+    if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+        return -1;
+
+    if (!(node_pages_selected =
+            (uint64_t *)calloc(XC_MAX_NODES, sizeof(uint64_t))))
+    {
+        rc = -1;
+        ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+        goto failed;
+    }
+    if ((rc = xc_select_best_fit_nodes(xch, phys_layout, 1,
+                    dom_layout->nr_pages, node_pages_selected)))
+    {
+        ERROR("%s: Not enough memory for CONFINE (need to balloon more?)\n",
+            __FUNCTION__);
+        goto failed;
+    }
+
+    dom_layout->type = XEN_DOM_NUMA_CONFINE;
+    rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+            node_pages_selected);
+    if (!rc)
+        DBGPRINTF("%s: Selected CONFINE for VM %d\n",
+            __FUNCTION__, dom_layout->domid);
+failed:
+    if (node_pages_selected)
+        free(node_pages_selected);
+    return rc;
+}
+
+/* For the numa guests, we construct a symmetric topology (wrt the
+ * distribution of vcpus over vnodes).
+ * We require the numa guests to have (2^n) vcpus and (2^k) vnodes.
+ * Each vnode is then assigned 2^(n-k) vcpus, where (n>=k). */
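[A worked instance of this symmetric split, with illustrative numbers: n = 3, k = 2, i.e. 8 vcpus over 4 vnodes:]

    /* 2^(n-k) = 2 vcpus per vnode. xc_setup_vnode_vcpu_masks() computes */
    /* vnode = vcpu / (nr_vcpus / nr_vnodes) = vcpu / 2, giving          */
    /*   vcpus {0,1} -> vnode 0, vcpus {2,3} -> vnode 1,                 */
    /*   vcpus {4,5} -> vnode 2, vcpus {6,7} -> vnode 3.                 */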
+static int xc_select_domain_split(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    uint64_t vnode_nr_pages, *node_pages_selected = 0;
+    int rc;
+
+    DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+    if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+        return -1;
+
+    if (!XC_POWER_OF_2(dom_layout->nr_vcpus))
+    {
+        ERROR("%s: #vcpus != 2^n (disable numa split)\n", __FUNCTION__);
+        return -1;
+    }
+    if (!XC_POWER_OF_2(dom_layout->nr_vnodes))
+    {
+        ERROR("%s: #vnodes != 2^n (disable numa split)\n", __FUNCTION__);
+        return -1;
+    }
+    if (dom_layout->nr_vcpus < (dom_layout->nr_vnodes*XC_VCPUS_PER_VNODE))
+    {
+        ERROR("%s: Failed - Not enough vcpus (%d on %d)!\n",
+            __FUNCTION__, dom_layout->nr_vcpus, dom_layout->nr_vnodes);
+        return -1;
+    }
+
+    vnode_nr_pages =
+        (dom_layout->nr_pages+XC_VNODE_BOOST_SIZE)/dom_layout->nr_vnodes;
+    vnode_nr_pages &= XC_VNODE_MIN_MASK;
+    if (vnode_nr_pages < XC_VNODE_MIN_SIZE)
+    {
+        ERROR("%s: vnode_size(%lx) too small (nr_pages=%x, nr_vnodes=%u)\n",
+            __FUNCTION__, vnode_nr_pages, dom_layout->nr_pages,
+            dom_layout->nr_vnodes);
+        return -1;
+    }
+    dom_layout->nr_pages = vnode_nr_pages*dom_layout->nr_vnodes;
+
+    if (!(node_pages_selected =
+            (uint64_t *)calloc(XC_MAX_NODES, sizeof(uint64_t))))
+    {
+        rc = -1;
+        ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+        goto failed;
+    }
+    if ((rc = xc_select_best_fit_nodes(xch, phys_layout, dom_layout->nr_vnodes,
+                    vnode_nr_pages, node_pages_selected)) != 0)
+    {
+        ERROR("%s: Not enough memory for SPLIT (need to balloon more?)\n",
+            __FUNCTION__);
+        goto failed;
+    }
+
+    dom_layout->nr_pages = dom_layout->nr_vnodes*vnode_nr_pages;
+    dom_layout->type = XEN_DOM_NUMA_SPLIT;
+    if ((rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+                    node_pages_selected)))
+        goto failed;
+
+    /* xc_domain_setmaxmem takes KB; shift pages by (PAGE_SHIFT-10) */
+    if ((rc = xc_domain_setmaxmem(xch, dom_layout->domid,
+            (dom_layout->nr_pages+XC_VNODE_BOOST_SIZE)<<(PAGE_SHIFT-10))))
+        goto failed;
+
+    DBGPRINTF("%s: Selected SPLIT for VM %d\n",
+        __FUNCTION__, dom_layout->domid);
+failed:
+    if (node_pages_selected)
+        free(node_pages_selected);
+    return rc;
+}
+
+static int xc_select_domain_stripe(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    uint64_t vnode_nr_pages, *node_pages_selected = 0;
+    int rc;
+
+    DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+    if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+        return -1;
+
+    vnode_nr_pages = dom_layout->nr_pages/dom_layout->nr_vnodes;
+
+    if (!(node_pages_selected =
+            (uint64_t *)calloc(XC_MAX_NODES, sizeof(uint64_t))))
+    {
+        rc = -1;
+        ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+        goto failed;
+    }
+    if ((rc = xc_select_best_fit_nodes(xch, phys_layout, dom_layout->nr_vnodes,
+                    vnode_nr_pages, node_pages_selected)) != 0)
+    {
+        ERROR("%s: Not enough memory for STRIPE (need to balloon more?)\n",
+            __FUNCTION__);
+        goto failed;
+    }
+
+    dom_layout->nr_pages = dom_layout->nr_vnodes*vnode_nr_pages;
+    dom_layout->type = XEN_DOM_NUMA_STRIPE;
+    rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+            node_pages_selected);
+    if (!rc)
+        DBGPRINTF("%s: Selected STRIPE for VM %d\n",
+            __FUNCTION__, dom_layout->domid);
+failed:
+    if (node_pages_selected)
+        free(node_pages_selected);
+    return rc;
+}
+
+static int xc_select_domain_dontcare(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    uint64_t *node_pages_selected = 0;
+    int rc;
+
+    DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+    if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+        return -1;
+
+    if (!(node_pages_selected =
+            (uint64_t *)calloc(XC_MAX_NODES, sizeof(uint64_t))))
+    {
+        rc = -1;
+        ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+        goto failed;
+    }
+    if ((rc = xc_select_max_fit_nodes(xch, phys_layout, dom_layout->nr_pages,
+                    node_pages_selected)) < 0)
+    {
+        ERROR("%s: Not enough memory for DONTCARE (need to balloon more?)\n",
+            __FUNCTION__);
+        goto failed;
+    }
+
+    dom_layout->type = XEN_DOM_NUMA_DONTCARE;
+    dom_layout->nr_vnodes = rc;
+    rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+            node_pages_selected);
+    if (!rc)
+        DBGPRINTF("%s: Selected DONTCARE for VM %d\n",
+            __FUNCTION__, dom_layout->domid);
+failed:
+    if (node_pages_selected)
+        free(node_pages_selected);
+    return rc;
+}
+
+/* Stub for now: detection of guest-image NUMA support is not wired up,
+ * so the AUTO path never attempts SPLIT. */
+#define XC_DOM_IS_NUMA_GUEST(n) (0)
+
+static int xc_select_domain_auto(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    int i;
+
+    /* Attempt to confine the VM */
+    DBGPRINTF("%s: Selecting allocation strategy for (VM %d)\n",
+        __FUNCTION__, dom_layout->domid);
+
+    dom_layout->nr_vnodes = 1;
+    if (!xc_select_domain_confine(xch, phys_layout, dom_layout))
+        return 0;
+
+    if (!XC_DOM_IS_NUMA_GUEST(dom_layout))
+        DBGPRINTF("%s: Image doesn't support numa (VM %d)\n",
+            __FUNCTION__, dom_layout->domid);
+    else
+    {
+        /* Attempt to split the VM (power-of-2 vnode counts) */
+        for (i = 2; i <= phys_layout->nr_nodes; i<<=1)
+        {
+            dom_layout->nr_vnodes = i;
+            if (!xc_select_domain_split(xch, phys_layout, dom_layout))
+                return 0;
+        }
+    }
+
+    /* Attempt to stripe the VM */
+    for (i = 2; i <= phys_layout->nr_nodes; i++)
+    {
+        dom_layout->nr_vnodes = i;
+        if (!xc_select_domain_stripe(xch, phys_layout, dom_layout))
+            return 0;
+    }
+
+    if (!xc_select_domain_dontcare(xch, phys_layout, dom_layout))
+        return 0;
+
+    ERROR("%s: Failed to allocate memory for the VM (need to balloon more?)\n",
+        __FUNCTION__);
+    return -1;
+}
+
+int xc_setup_numa_domain(xc_interface *xch, xc_domain_numa_layout_t *dom_layout)
+{
+    int rc;
+    xc_machine_numa_layout_t *phys_layout;
+
+    DBGPRINTF("%s: called (mem_strategy:%d)\n",
+        __FUNCTION__, dom_layout->strategy);
+
+    if (!(phys_layout = malloc(sizeof(*phys_layout))))
+    {
+        ERROR("%s: phys_layout allocation failed\n", __FUNCTION__);
+        return -1;
+    }
+
+    if ((rc = xc_get_machine_numa_layout(xch, phys_layout)))
+    {
+        ERROR("%s: xc_get_machine_numa_layout failed\n", __FUNCTION__);
+        goto done;
+    }
+
+    switch (dom_layout->strategy)
+    {
+        case XC_DOM_NUMA_AUTO:
+            rc = xc_select_domain_auto(xch, phys_layout, dom_layout);
+            break;
+        case XC_DOM_NUMA_CONFINE:
+            dom_layout->nr_vnodes = 1; /* in case it was misconfigured */
+            rc = xc_select_domain_confine(xch, phys_layout, dom_layout);
+            break;
+        case XC_DOM_NUMA_SPLIT:
+            rc = xc_select_domain_split(xch, phys_layout, dom_layout);
+            break;
+        case XC_DOM_NUMA_STRIPE:
+            rc = xc_select_domain_stripe(xch, phys_layout, dom_layout);
+            break;
+        default:
+            rc = -1;
+            ERROR("%s: Unknown memory allocation strategy (%d)\n",
+                __FUNCTION__, dom_layout->strategy);
+    }
+
+    if (rc)
+    {
+        ERROR("%s: xc_select_domain failed for (%d)\n",
+            __FUNCTION__, dom_layout->strategy);
+        goto done;
+    }
+
+    xc_dump_dom_numa_layout(xch, dom_layout);
+done:
+    free(phys_layout);
+    return rc;
+}
+static int
+xc_domain_numa_vcpu_setaffinity(xc_interface *xch, uint32_t domid,
+    int vcpu, struct xenctl_cpumap *cpumap)
+{
+    DECLARE_DOMCTL;
+    int ret = -1;
+
+    domctl.cmd = XEN_DOMCTL_setvcpuaffinity;
+    domctl.domain = (domid_t)domid;
+    domctl.u.vcpuaffinity.vcpu = vcpu;
+    domctl.u.vcpuaffinity.cpumap = *cpumap;
+
+    if ( xc_cpumap_lock_pages(cpumap) != 0 )
+    {
+        PERROR("Could not lock memory for Xen hypercall");
+        goto out;
+    }
+
+    ret = do_domctl(xch, &domctl);
+    xc_cpumap_unlock_pages(cpumap);
+ out:
+    return ret;
+}
+
+static int
+xc_domain_numa_pinvcpus_split(xc_interface *xch,
+    xc_domain_numa_layout_t *dom_layout,
+    xc_machine_numa_layout_t *phys_layout)
+{
+    int vnode;
+
+    for (vnode = 0; vnode < dom_layout->nr_vnodes; vnode++)
+    {
+        int vcpu;
+        int mnode = dom_layout->vnode_data[vnode].mnode_id;
+        xc_cpumask_t *node_cpumask =
+            &phys_layout->node_data[mnode].cpu_mask;
+        xc_cpumask_t *vnode_vcpumask =
+            &dom_layout->vnode_data[vnode].vcpu_mask;
+        struct xenctl_cpumap node_cpumap, vnode_vcpumap;
+
+        xc_cpumap_from_cpumask(&node_cpumap, node_cpumask);
+        xc_cpumap_from_cpumask(&vnode_vcpumap, vnode_vcpumask);
+        xc_for_each_cpu(vcpu, vnode_vcpumap)
+        {
+            if (xc_domain_numa_vcpu_setaffinity(
+                    xch, dom_layout->domid, vcpu, &node_cpumap))
+            {
+                ERROR("%s: xc_vcpu_setaffinity failed\n", __FUNCTION__);
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
+static int
+xc_domain_numa_pinvcpus_stripe(xc_interface *xch,
+    xc_domain_numa_layout_t *dom_layout,
+    xc_machine_numa_layout_t *phys_layout)
+{
+    int vnode, vcpu;
+    xc_cpumask_t stripe_cpumask;
+    struct xenctl_cpumap stripe_cpumap;
+
+    xc_cpumap_from_cpumask(&stripe_cpumap, &stripe_cpumask);
+    xc_cpumap_clearall(stripe_cpumap);
+
+    for (vnode = 0; vnode < dom_layout->nr_vnodes; vnode++)
+    {
+        int mnode = dom_layout->vnode_data[vnode].mnode_id;
+        xc_cpumask_t *node_cpumask =
+            &phys_layout->node_data[mnode].cpu_mask;
+        struct xenctl_cpumap node_cpumap;
+
+        xc_cpumap_from_cpumask(&node_cpumap, node_cpumask);
+        xc_cpumap_or(stripe_cpumap, stripe_cpumap, node_cpumap);
+    }
+
+    for (vcpu = 0; vcpu < dom_layout->nr_vcpus; vcpu++)
+    {
+        if (xc_domain_numa_vcpu_setaffinity(
+                xch, dom_layout->domid, vcpu, &stripe_cpumap))
+        {
+            ERROR("%s: xc_vcpu_setaffinity failed\n", __FUNCTION__);
+            return -1;
+        }
+    }
+    return 0;
+}
+
+int
+xc_domain_numa_pinvcpus(xc_interface *xch, xc_domain_numa_layout_t *dom_layout)
+{
+    int rc;
+
+    xc_machine_numa_layout_t *phys_layout;
+    if (!(phys_layout = malloc(sizeof(*phys_layout))))
+    {
+        ERROR("%s: layout allocation failed\n", __FUNCTION__);
+        return -1;
+    }
+
+    if ((rc = xc_get_machine_numa_layout(xch, phys_layout)))
+    {
+        ERROR("%s: xc_get_machine_numa_layout failed\n", __FUNCTION__);
+        goto done;
+    }
+
+    if ((dom_layout->type == XEN_DOM_NUMA_STRIPE) ||
+        (dom_layout->type == XEN_DOM_NUMA_DONTCARE))
+        rc = xc_domain_numa_pinvcpus_stripe(xch, dom_layout, phys_layout);
+    else
+        rc = xc_domain_numa_pinvcpus_split(xch, dom_layout, phys_layout);
+done:
+    free(phys_layout);
+    return rc;
+}
diff --git a/tools/libxc/xc_dom_numa.h b/tools/libxc/xc_dom_numa.h
new file mode 100644
--- /dev/null
+++ b/tools/libxc/xc_dom_numa.h
@@ -0,0 +1,73 @@
+#ifndef __XC_DOM_NUMA_H
+#define __XC_DOM_NUMA_H
+
+#include "xenctrl.h"
+#include
+
+#define XC_CPUMASK_NR_CPUS XEN_MAX_VCPUS
+#define XC_MAX_VNODES 8
+
+#define XC_CPUMASK_BITS_PER_BYTE 8
+#define XC_CPUMASK_BITS_TO_BYTES(bits) \
+    (((bits)+XC_CPUMASK_BITS_PER_BYTE-1)/XC_CPUMASK_BITS_PER_BYTE)
+#define XC_CPUMASK_DECLARE_BITMAP(name,bits) \
+    uint8_t name[XC_CPUMASK_BITS_TO_BYTES(bits)]
+
+struct xc_cpumask { XC_CPUMASK_DECLARE_BITMAP(bits, XC_CPUMASK_NR_CPUS); };
+typedef struct xc_cpumask xc_cpumask_t;
+
+/* Construct a xenctl_cpumap structure using the buffer from the xc_cpumask
+ * structure */
+#define xc_cpumap_from_cpumask(map, mask) \
+do { \
+    (map)->nr_cpus = XC_CPUMASK_NR_CPUS; \
+    set_xen_guest_handle((map)->bitmap, (mask)->bits); \
+} while(0)
+
+struct xc_vnode_data {
+    uint8_t vnode_id;
+    uint8_t mnode_id;
+    uint32_t nr_pages;
+    xc_cpumask_t vcpu_mask; /* vnode_to_vcpumask */
+};
+typedef struct xc_vnode_data xc_vnode_data_t;
+
+struct xc_domain_numa_layout {
+    uint8_t version;
+    uint8_t type;
+
+    uint8_t nr_vcpus;
+    uint8_t nr_vnodes;
+
+    uint32_t nr_pages;
+    /* Only (nr_vnodes) entries are filled */
+    xc_vnode_data_t vnode_data[XC_MAX_VNODES];
+    /* Only (nr_vnodes*nr_vnodes) entries are filled */
+    uint8_t vnode_distance[XC_MAX_VNODES*XC_MAX_VNODES];
+
+    /* For internal use only */
+    uint32_t domid;
+    uint16_t strategy;
+    uint16_t stripe_size;
+};
+typedef struct xc_domain_numa_layout xc_domain_numa_layout_t;
+
+extern xc_domain_numa_layout_t * xc_dom_alloc_numa_layout(xc_interface *xch,
+    uint32_t domid, uint64_t nr_pages, xc_domain_numa_config_t *config);
+extern void xc_dom_free_numa_layout(xc_interface *xch,
+    xc_domain_numa_layout_t *dom_layout);
+
+extern int
+xc_setup_numa_domain(xc_interface *xch, xc_domain_numa_layout_t *dom_layout);
+extern int
+xc_domain_numa_pinvcpus(xc_interface *xch, xc_domain_numa_layout_t *dom_layout);
+
+/* vnodes are exposed to the guest only for SPLIT */
+static inline int xc_domain_nr_vnodes(xc_domain_numa_layout_t * dom_layout)
+{
+    if (!dom_layout || (dom_layout->type != XEN_DOM_NUMA_SPLIT))
+        return 0;
+    return dom_layout->nr_vnodes;
+}
+
+#endif