vNUMA : Implement allocation strategies

diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile
--- a/tools/libxc/Makefile
+++ b/tools/libxc/Makefile
@@ -28,6 +28,7 @@ CTRL_SRCS-y += xc_mem_event.c
 CTRL_SRCS-y += xc_mem_paging.c
 CTRL_SRCS-y += xc_memshr.c
 CTRL_SRCS-y += xc_cpumap.c
+CTRL_SRCS-y += xc_dom_numa.c
 CTRL_SRCS-y += xtl_core.c
 CTRL_SRCS-y += xtl_logger_stdio.c
 CTRL_SRCS-$(CONFIG_X86) += xc_pagetab.c
diff --git a/tools/libxc/xc_dom_numa.c b/tools/libxc/xc_dom_numa.c
new file mode 100644
--- /dev/null
+++ b/tools/libxc/xc_dom_numa.c
@@ -0,0 +1,901 @@
+/* XEN Guest NUMA support
+ * Author : Dulloor (dulloor@xxxxxxxxxx) */
+
+#include
+#include
+#include "xg_private.h"
+#include "xc_dom_numa.h"
+#include "xc_cpumap.h"
+
+#ifdef __DOM_NUMA_DEBUG__
+#undef DBGPRINTF
+#define DBGPRINTF(_f, _a...) \
+    xc_report(xch, xch->error_handler, XTL_INFO, 0, _f , ## _a)
+#endif
+
+#define XC_MAX_NODES 16
+struct xc_node_data {
+    uint32_t node_id;
+    uint64_t size_pages;
+    uint64_t free_pages;
+    xc_cpumask_t cpu_mask; /* node_to_cpumask */
+};
+typedef struct xc_node_data xc_node_data_t;
+
+struct xc_machine_numa_layout {
+    uint64_t size_pages;
+    uint64_t free_pages;
+
+    uint32_t nr_nodes;
+
+    /* Only (nr_nodes*nr_nodes) entries are filled */
+    uint32_t node_distance[XC_MAX_NODES*XC_MAX_NODES];
+    /* Only (nr_nodes) entries are filled */
+    xc_node_data_t node_data[XC_MAX_NODES];
+};
+typedef struct xc_machine_numa_layout xc_machine_numa_layout_t;
+
+/* XXX: Move all sanity checks to this function */
+#define XC_DOM_NUMA_MIN_STRIPE 256
+xc_domain_numa_layout_t * xc_dom_alloc_numa_layout(xc_interface *xch,
+    uint32_t domid, uint64_t nr_pages, xc_domain_numa_config_t *config)
+{
+    xc_domain_numa_layout_t *dom_layout;
+
+    if (config->strategy == XC_DOM_NUMA_NONE)
+    {
+        IPRINTF("%s: NUMA memory allocation disabled\n", __FUNCTION__);
+        return 0;
+    }
+    if (!(dom_layout = (xc_domain_numa_layout_t *)malloc(sizeof(*dom_layout))))
+    {
+        ERROR("%s: dom_layout allocation failed\n", __FUNCTION__);
+        return dom_layout;
+    }
+
+    DBGPRINTF("%s: dom_layout allocated\n", __FUNCTION__);
+    memset(dom_layout, 0, sizeof(*dom_layout));
+
+    dom_layout->version = XEN_DOM_NUMA_INTERFACE_VERSION;
+    dom_layout->nr_pages = nr_pages;
+    dom_layout->nr_vnodes = config->nr_nodes;
+
+    /* Internal data */
+    dom_layout->domid = domid;
+    dom_layout->strategy = config->strategy;
+    dom_layout->stripe_size = config->stripe_size;
+    if (dom_layout->stripe_size &&
+        (dom_layout->stripe_size < XC_DOM_NUMA_MIN_STRIPE))
+    {
+        dom_layout->stripe_size = XC_DOM_NUMA_MIN_STRIPE;
+        IPRINTF("%s: Min STRIPE size is %d pages\n",
+            __FUNCTION__, dom_layout->stripe_size);
+    }
+    return dom_layout;
+}
+
+void
+xc_dom_free_numa_layout(xc_interface *xch, xc_domain_numa_layout_t *dom_layout)
+{
+    DBGPRINTF("%s: dom_layout freed\n", __FUNCTION__);
+    free(dom_layout);
+}
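[Not part of the patch: a minimal sketch of how a toolstack caller might drive this API end to end. The wrapper name and the config values are hypothetical; the entry points and the config fields (strategy, nr_nodes, stripe_size) are the ones this series defines.]

    /* Hypothetical caller, e.g. from a domain-builder path. */
    static int build_numa_placement(xc_interface *xch, uint32_t domid,
                                    uint64_t nr_pages)
    {
        xc_domain_numa_config_t config = { 0 };
        xc_domain_numa_layout_t *layout;
        int rc;

        config.strategy = XC_DOM_NUMA_AUTO; /* or CONFINE/SPLIT/STRIPE/NONE */
        config.nr_nodes = 2;                /* requested vnodes */
        config.stripe_size = 0;             /* 0 = default stripe size */

        /* Returns NULL both for strategy NONE and on allocation failure. */
        if (!(layout = xc_dom_alloc_numa_layout(xch, domid, nr_pages, &config)))
            return 0;

        rc = xc_setup_numa_domain(xch, layout);
        if (!rc)
            rc = xc_domain_numa_pinvcpus(xch, layout);

        xc_dom_free_numa_layout(xch, layout);
        return rc;
    }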
+#define XC_DUMP_STR_SZ (8192)
+static void
+xc_dump_dom_numa_layout(xc_interface *xch, xc_domain_numa_layout_t *layout)
+{
+    unsigned int i, j;
+    char *xc_dump_str, *dumpstr;
+    if (!(xc_dump_str = malloc(XC_DUMP_STR_SZ)))
+    {
+        DBGPRINTF("%s : dump_str allocation failed", __FUNCTION__);
+        return;
+    }
+    dumpstr = xc_dump_str;
+    dumpstr += sprintf(dumpstr,
+        "NUMA-LAYOUT(Dom %d) : vcpus(%u), vnodes(%u)",
+        layout->domid, layout->nr_vcpus, layout->nr_vnodes);
+    switch (layout->type)
+    {
+        case XEN_DOM_NUMA_CONFINE:
+            dumpstr += sprintf(dumpstr, ", type(CONFINE)\n");
+            break;
+        case XEN_DOM_NUMA_SPLIT:
+            dumpstr += sprintf(dumpstr, ", type(SPLIT)\n");
+            break;
+        case XEN_DOM_NUMA_STRIPE:
+            dumpstr += sprintf(dumpstr, ", type(STRIPE)\n");
+            break;
+        case XEN_DOM_NUMA_DONTCARE:
+            dumpstr += sprintf(dumpstr, ", type(DONTCARE)\n");
+            break;
+        default:
+            dumpstr += sprintf(dumpstr, ", type(UNDEFINED)\n");
+    }
+    for (i = 0; i < layout->nr_vnodes; i++)
+    {
+        xc_vnode_data_t *vnode_data = &layout->vnode_data[i];
+        dumpstr += sprintf(dumpstr, "vnode[%u]:mnode(%u), node_nr_pages(%x)",
+            vnode_data->vnode_id, vnode_data->mnode_id,
+            vnode_data->nr_pages);
+        if (layout->type == XEN_DOM_NUMA_SPLIT)
+        {
+            char mapstr[128] = "";
+            struct xenctl_cpumap cpumap;
+            xc_cpumap_from_cpumask(&cpumap, &vnode_data->vcpu_mask);
+            xc_cpumap_snprintf(mapstr, sizeof(mapstr), cpumap);
+            dumpstr += sprintf(dumpstr, ", vcpu_mask(%s)", mapstr);
+        }
+        dumpstr += sprintf(dumpstr, "\n");
+    }
+
+    if (layout->type == XEN_DOM_NUMA_CONFINE)
+        goto done;
+    dumpstr += sprintf(dumpstr, "vnode distances :\n");
+    for (i = 0; i < layout->nr_vnodes; i++)
+        dumpstr += sprintf(dumpstr, "\tvnode[%u]", i);
+    for (i = 0; i < layout->nr_vnodes; i++)
+    {
+        dumpstr += sprintf(dumpstr, "\nvnode[%u]", i);
+        for (j = 0; j < layout->nr_vnodes; j++)
+            dumpstr += sprintf(dumpstr, "\t%u",
+                layout->vnode_distance[i*layout->nr_vnodes + j]);
+        dumpstr += sprintf(dumpstr, "\n");
+    }
+done:
+    IPRINTF("%s", xc_dump_str);
+    free(xc_dump_str);
+    return;
+}
+
+static int
+xc_get_machine_numa_layout(xc_interface *xch, xc_machine_numa_layout_t *layout)
+{
+    uint32_t i, nr_nodes, nr_cpus;
+    xc_numainfo_t ninfo = { 0 };
+    uint64_t node_memsize[XC_MAX_NODES];
+    uint64_t node_memfree[XC_MAX_NODES];
+    xc_topologyinfo_t tinfo = { 0 };
+    uint32_t cpu_to_node[XC_CPUMASK_NR_CPUS];
+
+    memset(layout, 0, sizeof(*layout));
+    memset(node_memsize, 0, sizeof(uint64_t)*XC_MAX_NODES);
+    memset(node_memfree, 0, sizeof(uint64_t)*XC_MAX_NODES);
+
+    set_xen_guest_handle(ninfo.node_to_memsize, node_memsize);
+    set_xen_guest_handle(ninfo.node_to_memfree, node_memfree);
+    /* Read directly into layout's structure */
+    set_xen_guest_handle(ninfo.node_to_node_distance, layout->node_distance);
+    ninfo.max_node_index = XC_MAX_NODES-1;
+    if (xc_numainfo(xch, &ninfo))
+    {
+        ERROR("%s: xc_numainfo failed", __FUNCTION__);
+        return -1;
+    }
+    /* No need to check if a node is invalid, as in that case
+     * the size would be zero and it would never get selected */
+    nr_nodes = ninfo.max_node_index + 1;
+    if ( nr_nodes > XC_MAX_NODES )
+        nr_nodes = XC_MAX_NODES;
+
+    set_xen_guest_handle(tinfo.cpu_to_core, NULL);
+    set_xen_guest_handle(tinfo.cpu_to_socket, NULL);
+    set_xen_guest_handle(tinfo.cpu_to_node, cpu_to_node);
+    tinfo.max_cpu_index = XC_CPUMASK_NR_CPUS-1;
+
+    if (xc_topologyinfo(xch, &tinfo))
+    {
+        ERROR("%s: xc_topologyinfo failed", __FUNCTION__);
+        return -1;
+    }
+
+    nr_cpus = tinfo.max_cpu_index+1;
+    if (nr_cpus > XC_CPUMASK_NR_CPUS)
+        nr_cpus = XC_CPUMASK_NR_CPUS;
+
+    layout->nr_nodes = nr_nodes;
+    for (i=0; i<nr_nodes; i++)
+    {
+        uint64_t size_pages, free_pages;
+        layout->node_data[i].node_id = i;
+        size_pages = (node_memsize[i] >> PAGE_SHIFT);
+        free_pages = (node_memfree[i] >> PAGE_SHIFT);
+        layout->node_data[i].size_pages = size_pages;
+        layout->node_data[i].free_pages = free_pages;
+        layout->size_pages += size_pages;
+        layout->free_pages += free_pages;
+    }
+
+    for (i=0; i<nr_cpus; i++)
+    {
+        struct xenctl_cpumap cpumap;
+        xc_cpumask_t *cpumask = &(layout->node_data[(cpu_to_node[i])].cpu_mask);
+        xc_cpumap_from_cpumask(&cpumap, cpumask);
+        xc_cpumap_set_cpu(i, cpumap);
+    }
+    return 0;
+}
+static int
+xc_get_max_vcpus(xc_interface *xch, uint32_t domid)
+{
+    DECLARE_DOMCTL;
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)domid;
+    return ((do_domctl(xch, &domctl) < 0)
+            ? 0 : (domctl.u.getdomaininfo.max_vcpu_id+1));
+}
+
+/* The function makes a (greedy) best-fit selection of num_vnodes nodes,
+ * taking vnode_pages from each. The number of pages selected from each
+ * node is returned in the nodes_pages array.
+ * The best-fit ranking is based on the fraction (in 1024ths) of node
+ * memory that would be occupied if the node were selected.
+ * Returns 0 on success and -1 if selection fails. */
+/* XXX: Node selection needs more research/experience. */
+static int xc_select_best_fit_nodes(
+    xc_interface *xch, xc_machine_numa_layout_t *phys_layout,
+    uint32_t num_vnodes, uint64_t vnode_pages, uint64_t *nodes_pages)
+{
+    int i, num_nodes_selected;
+    uint64_t best_fit_rank;
+
+    DBGPRINTF("%s: called\n", __FUNCTION__);
+#define INVALID_NODE (~0)
+#define NODE_FIT_RANK_SHIFT (10)
+    num_nodes_selected = 0;
+
+    do {
+        int selected_node = INVALID_NODE;
+        best_fit_rank = 0;
+        for (i=0; i<phys_layout->nr_nodes; i++)
+        {
+            xc_node_data_t *node_data;
+            uint64_t node_sizepages, node_freepages;
+            uint64_t node_fit_rank;
+
+            /* Node is already selected */
+            if (nodes_pages[i])
+                continue;
+
+            node_data = &phys_layout->node_data[i];
+            node_sizepages = node_data->size_pages;
+            node_freepages = node_data->free_pages;
+
+            if (node_freepages < vnode_pages)
+                continue;
+
+            /* Fraction of the node occupied after taking vnode_pages */
+            node_fit_rank = ((node_sizepages-node_freepages+vnode_pages)
+                                << NODE_FIT_RANK_SHIFT) / node_sizepages;
+
+            if (node_fit_rank > best_fit_rank)
+            {
+                best_fit_rank = node_fit_rank;
+                selected_node = i;
+            }
+        }
+
+        /* Nodes could not be selected. Bail out ! */
+        if (selected_node == INVALID_NODE)
+            return -1;
+
+        nodes_pages[selected_node] = vnode_pages;
+        num_nodes_selected++;
+    } while(num_nodes_selected < num_vnodes);
+#undef NODE_FIT_RANK_SHIFT
+#undef INVALID_NODE
+    return 0;
+}
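[To make the ranking concrete, a worked example with illustrative numbers, not taken from the patch. Two half-free nodes compete for a 1GB vnode; the smaller node ends up proportionally fuller, so it ranks higher and is picked, which packs small domains tightly:]

    /* rank = ((size - free + vnode) << 10) / size, in 1024ths of the node */
    /* node A: size 0x100000 pages (4GB), free 0x80000 (2GB)               */
    /*   rank_A = ((0x100000 - 0x80000 + 0x40000) << 10) / 0x100000 = 768  */
    /* node B: size 0x200000 pages (8GB), free 0x100000 (4GB)              */
    /*   rank_B = ((0x200000 - 0x100000 + 0x40000) << 10) / 0x200000 = 640 */
    /* For a vnode of 0x40000 pages (1GB), node A wins (768 > 640).        */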
+/* Sort the phys nodes in increasing order of free node memory */
+static void xc_sort_nodeload(xc_machine_numa_layout_t *phys_layout)
+{
+    int i, j;
+    uint32_t nr_nodes;
+
+    nr_nodes = phys_layout->nr_nodes;
+
+    for (i = 0; i < nr_nodes; i++)
+    {
+        uint64_t i_node_free = phys_layout->node_data[i].free_pages;
+        for (j = i+1; j < nr_nodes; j++)
+        {
+            uint64_t j_node_free = phys_layout->node_data[j].free_pages;
+            if (i_node_free > j_node_free)
+            {
+                xc_node_data_t tmp_node_data;
+                tmp_node_data = phys_layout->node_data[i];
+                phys_layout->node_data[i] = phys_layout->node_data[j];
+                phys_layout->node_data[j] = tmp_node_data;
+                /* slot i now holds the smaller node */
+                i_node_free = j_node_free;
+            }
+        }
+    }
+
+    return;
+}
+
+/* The function selects the nodes in increasing order of free node memory,
+ * and fills them up. The physical memory map for such a domain is striped
+ * across all the selected nodes.
+ * The phys_layout node_data structures may be sorted in place, so
+ * node_data->node_id (not the array index) must be used when indexing the
+ * node_distance array.
+ * Returns the number of nodes selected, or -1 on failure. */
+static int xc_select_max_fit_nodes(
+    xc_interface *xch, xc_machine_numa_layout_t *phys_layout,
+    uint64_t dom_pages, uint64_t *node_pages)
+{
+    int i, num_nodes_selected = 0;
+    uint64_t dom_alloc_pages;
+
+    DBGPRINTF("%s: called\n", __FUNCTION__);
+    xc_sort_nodeload(phys_layout);
+
+    dom_alloc_pages = 0;
+    for (i=0; i<phys_layout->nr_nodes; i++)
+    {
+        xc_node_data_t *node_data;
+        uint64_t node_freepages;
+
+        node_data = &phys_layout->node_data[i];
+
+        /* In max-fit, if we try to pack the nodes too aggressively
+         * we might fail on any small allocation (from xen node heaps).
+         * That's why, with DEFAULT, we don't use the exact_node flag. */
+        node_freepages = node_data->free_pages;
+        if (!node_freepages)
+            continue;
+
+        if (node_freepages > (dom_pages-dom_alloc_pages))
+            node_freepages = (dom_pages-dom_alloc_pages);
+
+        node_pages[i] = node_freepages;
+        dom_alloc_pages += node_freepages;
+        num_nodes_selected++;
+        if (dom_alloc_pages == dom_pages)
+            break;
+    }
+    if (dom_alloc_pages != dom_pages)
+    {
+        ERROR("%s: Failed to allocate memory (need to balloon more?)\n",
+            __FUNCTION__);
+        return -1;
+    }
+    return num_nodes_selected;
+}
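[The max-fit path, by contrast, greedily drains nodes from least to most free memory. A small sketch with illustrative numbers:]

    /* dom_pages = 0xA0000; nodes after xc_sort_nodeload (by free pages): */
    /*   node A: free 0x10000 -> contributes 0x10000 (drained)            */
    /*   node B: free 0x40000 -> contributes 0x40000 (drained)            */
    /*   node C: free 0x80000 -> contributes 0x50000 (the remainder)      */
    /* Three nodes selected; the domain's memory is striped across them.  */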
+static int xc_setup_vnode_vcpu_masks(xc_domain_numa_layout_t *dom_layout)
+{
+    int vcpu;
+    for (vcpu=0; vcpu<dom_layout->nr_vcpus; vcpu++)
+    {
+        struct xenctl_cpumap vcpumap;
+        xc_cpumask_t *vcpumask;
+        int vnode = vcpu/(dom_layout->nr_vcpus/dom_layout->nr_vnodes);
+
+        vcpumask = &dom_layout->vnode_data[vnode].vcpu_mask;
+        xc_cpumap_from_cpumask(&vcpumap, vcpumask);
+        xc_cpumap_set_cpu(vcpu, vcpumap);
+    }
+    return 0;
+}
+
+static int xc_setup_vnode_distances(xc_machine_numa_layout_t *phys_layout,
+    xc_domain_numa_layout_t *dom_layout)
+{
+    int vn1, vn2;
+    for (vn1=0; vn1<dom_layout->nr_vnodes; vn1++)
+    {
+        int n1 = dom_layout->vnode_data[vn1].mnode_id;
+        for (vn2=0; vn2<dom_layout->nr_vnodes; vn2++)
+        {
+            int n2 = dom_layout->vnode_data[vn2].mnode_id;
+            dom_layout->vnode_distance[(vn1*dom_layout->nr_vnodes)+vn2] =
+                phys_layout->node_distance[(n1*phys_layout->nr_nodes)+n2];
+        }
+    }
+    return 0;
+}
+
+/* We require the vnodes to be aligned to 1GB.
+ * SHIFT values are for 4K pages. */
+#define XC_VNODE_MIN_SHIFT (XEN_MIN_VNODE_SHIFT-PAGE_SHIFT)
+#define XC_VNODE_MIN_SIZE (1UL << XC_VNODE_MIN_SHIFT)
+#define XC_VNODE_MIN_MASK ~(XC_VNODE_MIN_SIZE-1)
+/* Because we are strict with the alignment, we boost the size
+ * to account for the pages not seen in the physmap (by 16MB for now). */
+#define XC_VNODE_BOOST_SIZE (4096)
+#define XC_VCPUS_PER_VNODE (1)
+#define XC_POWER_OF_2(x) (((x) & ((x) - 1)) == 0)
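[Spelling the constants out, assuming PAGE_SHIFT is 12 (4K pages) and XEN_MIN_VNODE_SHIFT is 30, i.e. the 1GB stated in the comment; both values live in headers outside this patch, so they are assumptions here:]

    /* XC_VNODE_MIN_SHIFT  = 30 - 12 = 18                              */
    /* XC_VNODE_MIN_SIZE   = 1UL << 18 = 0x40000 pages = 1GB           */
    /* XC_VNODE_MIN_MASK   = ~0x3ffff, rounds a page count down to a   */
    /*                       1GB multiple                              */
    /* XC_VNODE_BOOST_SIZE = 4096 pages * 4KB = 16MB                   */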
+static int xc_setup_domain_vnodes(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout,
+    uint64_t *node_pages_selected)
+{
+    int i;
+    uint32_t vnode_id;
+
+    for (i=0, vnode_id=0; i<phys_layout->nr_nodes; i++)
+    {
+        xc_node_data_t *node_data;
+        xc_vnode_data_t *vnode_data;
+
+        if (!node_pages_selected[i])
+            continue;
+
+        node_data = &phys_layout->node_data[i];
+        vnode_data = &dom_layout->vnode_data[vnode_id];
+        vnode_data->vnode_id = vnode_id;
+        vnode_data->nr_pages = node_pages_selected[i];
+        vnode_data->mnode_id = node_data->node_id;
+        vnode_id++;
+    }
+    if (vnode_id != dom_layout->nr_vnodes)
+    {
+        ERROR("%s: Internal Error(vnode count mismatch) (%d/%d) !\n",
+            __FUNCTION__, vnode_id, dom_layout->nr_vnodes);
+        return -1;
+    }
+    /* vnodes are exposed to the guest only for SPLIT. */
+    if (xc_setup_vnode_vcpu_masks(dom_layout) ||
+        (xc_setup_vnode_distances(phys_layout, dom_layout)))
+    {
+        ERROR("%s: vnode setup failed !\n", __FUNCTION__);
+        return -1;
+    }
+
+    return 0;
+}
+
+static int xc_select_domain_prep(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    if (!dom_layout->nr_vnodes)
+    {
+        ERROR("%s: VM nr_vnodes configured incorrectly !\n", __FUNCTION__);
+        return -1;
+    }
+
+    if (dom_layout->nr_pages > phys_layout->free_pages)
+    {
+        ERROR(
+            "%s: Not enough memory for pv (unlikely after balloon checks)\n",
+            __FUNCTION__);
+        return -1;
+    }
+
+    if (!(dom_layout->nr_vcpus = xc_get_max_vcpus(xch, dom_layout->domid)))
+    {
+        ERROR("%s: xc_get_max_vcpus failed !\n", __FUNCTION__);
+        return -1;
+    }
+
+    if (dom_layout->nr_vcpus > XC_CPUMASK_NR_CPUS)
+    {
+        ERROR("%s: Failed - More than %d vcpus!\n",
+            __FUNCTION__, XC_CPUMASK_NR_CPUS);
+        return -1;
+    }
+
+    if (dom_layout->nr_vcpus < dom_layout->nr_vnodes)
+    {
+        ERROR("%s: VM (%d) - fewer vcpus(%d) than vnodes(%d)!\n",
+            __FUNCTION__, dom_layout->domid, dom_layout->nr_vcpus,
+            dom_layout->nr_vnodes);
+        return -1;
+    }
+
+    return 0;
+}
+
+static int xc_select_domain_confine(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    uint64_t *node_pages_selected = 0;
+    int rc;
+
+    DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+    if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+        return -1;
+
+    if (!(node_pages_selected =
+            (uint64_t *)calloc(XC_MAX_NODES, sizeof(uint64_t))))
+    {
+        rc = -1;
+        ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+        goto failed;
+    }
+    if ((rc = xc_select_best_fit_nodes(xch, phys_layout, 1,
+                    dom_layout->nr_pages, node_pages_selected)))
+    {
+        ERROR("%s: Not enough memory for CONFINE (need to balloon more?)\n",
+            __FUNCTION__);
+        goto failed;
+    }
+
+    dom_layout->type = XEN_DOM_NUMA_CONFINE;
+    rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+            node_pages_selected);
+    if (!rc)
+        DBGPRINTF("%s: Selected CONFINE for VM %d\n",
+            __FUNCTION__, dom_layout->domid);
+failed:
+    if (node_pages_selected)
+        free(node_pages_selected);
+    return rc;
+}
+
+/* For the numa guests, we construct a symmetric topology (wrt the
+ * distribution of vcpus over vnodes).
+ * We require the numa guests to have (2^n) vcpus and (2^k) vnodes.
+ * Each vnode is then assigned 2^(n-k) vcpus, where (n>=k). */
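[A worked instance of this symmetric split, with illustrative numbers: n = 3, k = 2, i.e. 8 vcpus over 4 vnodes:]

    /* 2^(n-k) = 2 vcpus per vnode. xc_setup_vnode_vcpu_masks() computes */
    /* vnode = vcpu / (nr_vcpus / nr_vnodes) = vcpu / 2, giving          */
    /*   vcpus {0,1} -> vnode 0, vcpus {2,3} -> vnode 1,                 */
    /*   vcpus {4,5} -> vnode 2, vcpus {6,7} -> vnode 3.                 */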
+static int xc_select_domain_split(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    uint64_t vnode_nr_pages, *node_pages_selected = 0;
+    int rc;
+
+    DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+    if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+        return -1;
+
+    if (!XC_POWER_OF_2(dom_layout->nr_vcpus))
+    {
+        ERROR("%s: #vcpus != 2^n (disable numa split)\n", __FUNCTION__);
+        return -1;
+    }
+    if (!XC_POWER_OF_2(dom_layout->nr_vnodes))
+    {
+        ERROR("%s: #vnodes != 2^n (disable numa split)\n", __FUNCTION__);
+        return -1;
+    }
+    if (dom_layout->nr_vcpus < (dom_layout->nr_vnodes*XC_VCPUS_PER_VNODE))
+    {
+        ERROR("%s: Failed - Not enough vcpus (%d on %d)!\n",
+            __FUNCTION__, dom_layout->nr_vcpus, dom_layout->nr_vnodes);
+        return -1;
+    }
+
+    vnode_nr_pages =
+        (dom_layout->nr_pages+XC_VNODE_BOOST_SIZE)/dom_layout->nr_vnodes;
+    vnode_nr_pages &= XC_VNODE_MIN_MASK;
+    if (vnode_nr_pages < XC_VNODE_MIN_SIZE)
+    {
+        ERROR("%s: vnode_size(%lx) too small (nr_pages=%x, nr_vnodes=%u)\n",
+            __FUNCTION__, vnode_nr_pages, dom_layout->nr_pages,
+            dom_layout->nr_vnodes);
+        return -1;
+    }
+    dom_layout->nr_pages = vnode_nr_pages*dom_layout->nr_vnodes;
+
+    if (!(node_pages_selected =
+            (uint64_t *)calloc(XC_MAX_NODES, sizeof(uint64_t))))
+    {
+        rc = -1;
+        ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+        goto failed;
+    }
+    if ((rc = xc_select_best_fit_nodes(xch, phys_layout, dom_layout->nr_vnodes,
+                    vnode_nr_pages, node_pages_selected)) != 0)
+    {
+        ERROR("%s: Not enough memory for SPLIT (need to balloon more?)\n",
+            __FUNCTION__);
+        goto failed;
+    }
+
+    dom_layout->nr_pages = dom_layout->nr_vnodes*vnode_nr_pages;
+    dom_layout->type = XEN_DOM_NUMA_SPLIT;
+    if ((rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+                    node_pages_selected)))
+        goto failed;
+
+    /* xc_domain_setmaxmem takes KB; shift pages by (PAGE_SHIFT-10) */
+    if ((rc = xc_domain_setmaxmem(xch, dom_layout->domid,
+            (dom_layout->nr_pages+XC_VNODE_BOOST_SIZE)<<(PAGE_SHIFT-10))))
+        goto failed;
+
+    DBGPRINTF("%s: Selected SPLIT for VM %d\n",
+        __FUNCTION__, dom_layout->domid);
+failed:
+    if (node_pages_selected)
+        free(node_pages_selected);
+    return rc;
+}
+
+static int xc_select_domain_stripe(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    uint64_t vnode_nr_pages, *node_pages_selected = 0;
+    int rc;
+
+    DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+    if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+        return -1;
+
+    vnode_nr_pages = dom_layout->nr_pages/dom_layout->nr_vnodes;
+
+    if (!(node_pages_selected =
+            (uint64_t *)calloc(XC_MAX_NODES, sizeof(uint64_t))))
+    {
+        rc = -1;
+        ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+        goto failed;
+    }
+    if ((rc = xc_select_best_fit_nodes(xch, phys_layout, dom_layout->nr_vnodes,
+                    vnode_nr_pages, node_pages_selected)) != 0)
+    {
+        ERROR("%s: Not enough memory for STRIPE (need to balloon more?)\n",
+            __FUNCTION__);
+        goto failed;
+    }
+
+    dom_layout->nr_pages = dom_layout->nr_vnodes*vnode_nr_pages;
+    dom_layout->type = XEN_DOM_NUMA_STRIPE;
+    rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+            node_pages_selected);
+    if (!rc)
+        DBGPRINTF("%s: Selected STRIPE for VM %d\n",
+            __FUNCTION__, dom_layout->domid);
+failed:
+    if (node_pages_selected)
+        free(node_pages_selected);
+    return rc;
+}
+
+static int xc_select_domain_dontcare(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    uint64_t *node_pages_selected = 0;
+    int rc;
+
+    DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+    if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+        return -1;
+
+    if (!(node_pages_selected =
+            (uint64_t *)calloc(XC_MAX_NODES, sizeof(uint64_t))))
+    {
+        rc = -1;
+        ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+        goto failed;
+    }
+    if ((rc = xc_select_max_fit_nodes(xch, phys_layout, dom_layout->nr_pages,
+                    node_pages_selected)) < 0)
+    {
+        ERROR("%s: Not enough memory for DONTCARE (need to balloon more?)\n",
+            __FUNCTION__);
+        goto failed;
+    }
+
+    dom_layout->type = XEN_DOM_NUMA_DONTCARE;
+    dom_layout->nr_vnodes = rc;
+    rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+            node_pages_selected);
+    if (!rc)
+        DBGPRINTF("%s: Selected DONTCARE for VM %d\n",
+            __FUNCTION__, dom_layout->domid);
+failed:
+    if (node_pages_selected)
+        free(node_pages_selected);
+    return rc;
+}
+
+/* Stub for now: detection of guest-image NUMA support is not wired up,
+ * so the AUTO path never attempts SPLIT. */
+#define XC_DOM_IS_NUMA_GUEST(n) (0)
+
+static int xc_select_domain_auto(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    int i;
+
+    /* Attempt to confine the VM */
+    DBGPRINTF("%s: Selecting allocation strategy for (VM %d)\n",
+        __FUNCTION__, dom_layout->domid);
+
+    dom_layout->nr_vnodes = 1;
+    if (!xc_select_domain_confine(xch, phys_layout, dom_layout))
+        return 0;
+
+    if (!XC_DOM_IS_NUMA_GUEST(dom_layout))
+        DBGPRINTF("%s: Image doesn't support numa (VM %d)\n",
+            __FUNCTION__, dom_layout->domid);
+    else
+    {
+        /* Attempt to split the VM (power-of-2 vnode counts) */
+        for (i = 2; i <= phys_layout->nr_nodes; i<<=1)
+        {
+            dom_layout->nr_vnodes = i;
+            if (!xc_select_domain_split(xch, phys_layout, dom_layout))
+                return 0;
+        }
+    }
+
+    /* Attempt to stripe the VM */
+    for (i = 2; i <= phys_layout->nr_nodes; i++)
+    {
+        dom_layout->nr_vnodes = i;
+        if (!xc_select_domain_stripe(xch, phys_layout, dom_layout))
+            return 0;
+    }
+
+    if (!xc_select_domain_dontcare(xch, phys_layout, dom_layout))
+        return 0;
+
+    ERROR("%s: Failed to allocate memory for the VM (need to balloon more?)\n",
+        __FUNCTION__);
+    return -1;
+}
+
+int xc_setup_numa_domain(xc_interface *xch, xc_domain_numa_layout_t *dom_layout)
+{
+    int rc;
+    xc_machine_numa_layout_t *phys_layout;
+
+    DBGPRINTF("%s: called (mem_strategy:%d)\n",
+        __FUNCTION__, dom_layout->strategy);
+
+    if (!(phys_layout = malloc(sizeof(*phys_layout))))
+    {
+        ERROR("%s: phys_layout allocation failed\n", __FUNCTION__);
+        return -1;
+    }
+
+    if ((rc = xc_get_machine_numa_layout(xch, phys_layout)))
+    {
+        ERROR("%s: xc_get_machine_numa_layout failed\n", __FUNCTION__);
+        goto done;
+    }
+
+    switch (dom_layout->strategy)
+    {
+        case XC_DOM_NUMA_AUTO:
+            rc = xc_select_domain_auto(xch, phys_layout, dom_layout);
+            break;
+        case XC_DOM_NUMA_CONFINE:
+            dom_layout->nr_vnodes = 1; /* in case it was misconfigured */
+            rc = xc_select_domain_confine(xch, phys_layout, dom_layout);
+            break;
+        case XC_DOM_NUMA_SPLIT:
+            rc = xc_select_domain_split(xch, phys_layout, dom_layout);
+            break;
+        case XC_DOM_NUMA_STRIPE:
+            rc = xc_select_domain_stripe(xch, phys_layout, dom_layout);
+            break;
+        default:
+            rc = -1;
+            ERROR("%s: Unknown memory allocation strategy (%d)\n",
+                __FUNCTION__, dom_layout->strategy);
+    }
+
+    if (rc)
+    {
+        ERROR("%s: xc_select_domain failed for (%d)\n",
+            __FUNCTION__, dom_layout->strategy);
+        goto done;
+    }
+
+    xc_dump_dom_numa_layout(xch, dom_layout);
+done:
+    free(phys_layout);
+    return rc;
+}
+static int
+xc_domain_numa_vcpu_setaffinity(xc_interface *xch, uint32_t domid,
+    int vcpu, struct xenctl_cpumap *cpumap)
+{
+    DECLARE_DOMCTL;
+    int ret = -1;
+
+    domctl.cmd = XEN_DOMCTL_setvcpuaffinity;
+    domctl.domain = (domid_t)domid;
+    domctl.u.vcpuaffinity.vcpu = vcpu;
+    domctl.u.vcpuaffinity.cpumap = *cpumap;
+
+    if ( xc_cpumap_lock_pages(cpumap) != 0 )
+    {
+        PERROR("Could not lock memory for Xen hypercall");
+        goto out;
+    }
+
+    ret = do_domctl(xch, &domctl);
+    xc_cpumap_unlock_pages(cpumap);
+ out:
+    return ret;
+}
+
+static int
+xc_domain_numa_pinvcpus_split(xc_interface *xch,
+    xc_domain_numa_layout_t *dom_layout,
+    xc_machine_numa_layout_t *phys_layout)
+{
+    int vnode;
+
+    for (vnode = 0; vnode < dom_layout->nr_vnodes; vnode++)
+    {
+        int vcpu;
+        int mnode = dom_layout->vnode_data[vnode].mnode_id;
+        xc_cpumask_t *node_cpumask =
+            &phys_layout->node_data[mnode].cpu_mask;
+        xc_cpumask_t *vnode_vcpumask =
+            &dom_layout->vnode_data[vnode].vcpu_mask;
+        struct xenctl_cpumap node_cpumap, vnode_vcpumap;
+
+        xc_cpumap_from_cpumask(&node_cpumap, node_cpumask);
+        xc_cpumap_from_cpumask(&vnode_vcpumap, vnode_vcpumask);
+        xc_for_each_cpu(vcpu, vnode_vcpumap)
+        {
+            if (xc_domain_numa_vcpu_setaffinity(
+                    xch, dom_layout->domid, vcpu, &node_cpumap))
+            {
+                ERROR("%s: xc_vcpu_setaffinity failed\n", __FUNCTION__);
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
+static int
+xc_domain_numa_pinvcpus_stripe(xc_interface *xch,
+    xc_domain_numa_layout_t *dom_layout,
+    xc_machine_numa_layout_t *phys_layout)
+{
+    int vnode, vcpu;
+    xc_cpumask_t stripe_cpumask;
+    struct xenctl_cpumap stripe_cpumap;
+
+    xc_cpumap_from_cpumask(&stripe_cpumap, &stripe_cpumask);
+    xc_cpumap_clearall(stripe_cpumap);
+
+    for (vnode = 0; vnode < dom_layout->nr_vnodes; vnode++)
+    {
+        int mnode = dom_layout->vnode_data[vnode].mnode_id;
+        xc_cpumask_t *node_cpumask =
+            &phys_layout->node_data[mnode].cpu_mask;
+        struct xenctl_cpumap node_cpumap;
+
+        xc_cpumap_from_cpumask(&node_cpumap, node_cpumask);
+        xc_cpumap_or(stripe_cpumap, stripe_cpumap, node_cpumap);
+    }
+
+    for (vcpu = 0; vcpu < dom_layout->nr_vcpus; vcpu++)
+    {
+        if (xc_domain_numa_vcpu_setaffinity(
+                xch, dom_layout->domid, vcpu, &stripe_cpumap))
+        {
+            ERROR("%s: xc_vcpu_setaffinity failed\n", __FUNCTION__);
+            return -1;
+        }
+    }
+    return 0;
+}
+
+int
+xc_domain_numa_pinvcpus(xc_interface *xch, xc_domain_numa_layout_t *dom_layout)
+{
+    int rc;
+
+    xc_machine_numa_layout_t *phys_layout;
+    if (!(phys_layout = malloc(sizeof(*phys_layout))))
+    {
+        ERROR("%s: layout allocation failed\n", __FUNCTION__);
+        return -1;
+    }
+
+    if ((rc = xc_get_machine_numa_layout(xch, phys_layout)))
+    {
+        ERROR("%s: xc_get_machine_numa_layout failed\n", __FUNCTION__);
+        goto done;
+    }
+
+    if ((dom_layout->type == XEN_DOM_NUMA_STRIPE) ||
+        (dom_layout->type == XEN_DOM_NUMA_DONTCARE))
+        rc = xc_domain_numa_pinvcpus_stripe(xch, dom_layout, phys_layout);
+    else
+        rc = xc_domain_numa_pinvcpus_split(xch, dom_layout, phys_layout);
+done:
+    free(phys_layout);
+    return rc;
+}
diff --git a/tools/libxc/xc_dom_numa.h b/tools/libxc/xc_dom_numa.h
new file mode 100644
--- /dev/null
+++ b/tools/libxc/xc_dom_numa.h
@@ -0,0 +1,73 @@
+#ifndef __XC_DOM_NUMA_H
+#define __XC_DOM_NUMA_H
+
+#include "xenctrl.h"
+#include
+
+#define XC_CPUMASK_NR_CPUS XEN_MAX_VCPUS
+#define XC_MAX_VNODES 8
+
+#define XC_CPUMASK_BITS_PER_BYTE 8
+#define XC_CPUMASK_BITS_TO_BYTES(bits) \
+    (((bits)+XC_CPUMASK_BITS_PER_BYTE-1)/XC_CPUMASK_BITS_PER_BYTE)
+#define XC_CPUMASK_DECLARE_BITMAP(name,bits) \
+    uint8_t name[XC_CPUMASK_BITS_TO_BYTES(bits)]
+
+struct xc_cpumask { XC_CPUMASK_DECLARE_BITMAP(bits, XC_CPUMASK_NR_CPUS); };
+typedef struct xc_cpumask xc_cpumask_t;
+
+/* Construct a xenctl_cpumap structure using the buffer from the xc_cpumask
+ * structure */
+#define xc_cpumap_from_cpumask(map, mask) \
+do { \
+    (map)->nr_cpus = XC_CPUMASK_NR_CPUS; \
+    set_xen_guest_handle((map)->bitmap, (mask)->bits); \
+} while(0)
+
+struct xc_vnode_data {
+    uint8_t vnode_id;
+    uint8_t mnode_id;
+    uint32_t nr_pages;
+    xc_cpumask_t vcpu_mask; /* vnode_to_vcpumask */
+};
+typedef struct xc_vnode_data xc_vnode_data_t;
+
+struct xc_domain_numa_layout {
+    uint8_t version;
+    uint8_t type;
+
+    uint8_t nr_vcpus;
+    uint8_t nr_vnodes;
+
+    uint32_t nr_pages;
+    /* Only (nr_vnodes) entries are filled */
+    xc_vnode_data_t vnode_data[XC_MAX_VNODES];
+    /* Only (nr_vnodes*nr_vnodes) entries are filled */
+    uint8_t vnode_distance[XC_MAX_VNODES*XC_MAX_VNODES];
+
+    /* For internal use only */
+    uint32_t domid;
+    uint16_t strategy;
+    uint16_t stripe_size;
+};
+typedef struct xc_domain_numa_layout xc_domain_numa_layout_t;
+
+extern xc_domain_numa_layout_t * xc_dom_alloc_numa_layout(xc_interface *xch,
+    uint32_t domid, uint64_t nr_pages, xc_domain_numa_config_t *config);
+extern void xc_dom_free_numa_layout(xc_interface *xch,
+    xc_domain_numa_layout_t *dom_layout);
+
+extern int
+xc_setup_numa_domain(xc_interface *xch, xc_domain_numa_layout_t *dom_layout);
+extern int
+xc_domain_numa_pinvcpus(xc_interface *xch, xc_domain_numa_layout_t *dom_layout);
+
+/* vnodes are exposed to the guest only for SPLIT */
+static inline int xc_domain_nr_vnodes(xc_domain_numa_layout_t * dom_layout)
+{
+    if (!dom_layout || (dom_layout->type != XEN_DOM_NUMA_SPLIT))
+        return 0;
+    return dom_layout->nr_vnodes;
+}
+
+#endif