diff -r 3004092c4792 -r aa6ed694499d tools/libxc/xc_dom_core.c --- a/tools/libxc/xc_dom_core.c Thu Apr 01 16:23:27 2010 -0400 +++ b/tools/libxc/xc_dom_core.c Sat Apr 03 02:02:05 2010 -0400 @@ -681,10 +681,10 @@ xc_dom_printf("%s: 0x%" PRIpfn " pages\n", __FUNCTION__, dom->total_pages); - if (xc_setup_domain_numa_layout(dom)) + if (xc_setup_numa_domain(dom)) { /* Ignore the error and proceed as non-numa guest */ - xc_dom_printf("%s: xc_setup_domain_layout failed\n", __FUNCTION__); + xc_dom_printf("%s: xc_setup_numa_domain failed\n", __FUNCTION__); } return 0; diff -r 3004092c4792 -r aa6ed694499d tools/libxc/xc_dom_numa.c --- a/tools/libxc/xc_dom_numa.c Thu Apr 01 16:23:27 2010 -0400 +++ b/tools/libxc/xc_dom_numa.c Sat Apr 03 02:02:05 2010 -0400 @@ -1,4 +1,4 @@ -/* XEN Guest NUMA (memory placement) +/* XEN Guest NUMA support * Author : Dulloor (dulloor@xxxxxxxxxx) */ #include @@ -20,6 +20,16 @@ set_xen_guest_handle((map)->bitmap, (mask)->bits); \ }while(0) +struct xc_machine_numa_layout { + uint64_t memsize; + uint64_t memfree; + + uint32_t max_nodes; + uint32_t node_distance[XC_MAX_NODES*XC_MAX_NODES]; + struct xenmem_node_data node_data[XC_MAX_NODES]; +}; +typedef struct xc_machine_numa_layout xc_machine_numa_layout_t; + /* numa layout structures */ xc_machine_numa_layout_t phys_numa_layout; xc_domain_numa_layout_t pv_numa_layout; @@ -70,8 +80,42 @@ return; } +int __xc_domain_numa_layout(uint32_t cmd, int xc_handle, domid_t domid, + struct xen_domain_numa_layout *pv_layout) +{ + int rc; + struct xenmem_numa_op memop; + + /* Update the domain's numa_layout structure */ + memop.cmd = cmd; + memop.u.dinfo.domid = domid; + memop.u.dinfo.version = XEN_DOM_NUMA_INTERFACE_VERSION; + memop.u.dinfo.bufsize = sizeof(*pv_layout); + set_xen_guest_handle(memop.u.dinfo.buf, pv_layout); + + rc = 0; + if (lock_pages(&memop, sizeof(struct xenmem_numa_op)) || + lock_pages(pv_layout, sizeof(*pv_layout))) + { + rc = -1; + PERROR("Could not lock memory for Xen hypercall"); + goto out; 
+ } + + if ((rc = xc_memory_op(xc_handle, XENMEM_numa_op, &memop))) + { + rc = -1; + xc_dom_printf("%s: XENMEM_machine_numa_layout failed\n", __FUNCTION__); + } + + unlock_pages(&memop, sizeof(struct xenmem_numa_op)); + unlock_pages(pv_layout, sizeof(*pv_layout)); +out: + return rc; +} + static int -xc_read_machine_numa_layout(int xc_handle, xc_machine_numa_layout_t *layout) +xc_get_machine_numa_layout(int xc_handle, xc_machine_numa_layout_t *layout) { int rc, i; struct xenmem_numa_op memop; @@ -114,6 +158,7 @@ unlock: unlock_pages(&memop, sizeof(struct xenmem_numa_op)); + unlock_pages(layout, sizeof(*layout)); out: return rc; } @@ -334,6 +379,25 @@ return 0; } +static int xc_setup_vnode_distances(xc_machine_numa_layout_t *phys_layout, + xc_domain_numa_layout_t *pv_layout) +{ + int vn1, vn2; + + for (vn1=0; vn1<pv_layout->max_vnodes; vn1++) + { + int n1 = pv_layout->vnode_data[vn1].mnode_id; + for (vn2=0; vn2<pv_layout->max_vnodes; vn2++) + { + int n2 = pv_layout->vnode_data[vn2].mnode_id; + pv_layout->vnode_distance[(vn1*pv_layout->max_vnodes)+vn2] = + phys_layout->node_distance[(n1*phys_layout->max_nodes)+n2]; + + } + } + + return 0; +} /* Policies for node selection need more research/experience. * Also, live migration of the VMs (to other nodes) could provide * periodic load balancing across the nodes. 
*/ @@ -360,6 +424,7 @@ return -1; } + memset(pv_layout, 0, sizeof(*pv_layout)); if (!(pv_layout->max_vcpus = xc_get_max_vcpus(dom->guest_xc, dom->guest_domid))) { @@ -439,8 +504,6 @@ vnode_data->mnode_id = node_data->node_id; pv_layout->max_vnodes++; - /* vnode_data->vcpu_mask is set later when distributing the - * vcpus over vnodes and assigning affinities */ } if (xc_setup_vnode_vcpu_masks(pv_layout)) @@ -448,44 +511,151 @@ xc_dom_printf("%s: xc_setup_vnode_vcpu_masks failed !\n", __FUNCTION__); return -1; } - dom->numa_layout = pv_layout; + + if (xc_setup_vnode_distances(phys_layout, pv_layout)) + { + xc_dom_printf("%s: xc_setup_vnode_distances failed !\n", __FUNCTION__); + return -1; + } + return 0; } -int xc_setup_domain_numa_layout(struct xc_dom_image *dom) +int xc_setup_numa_domain(struct xc_dom_image *dom) { int xc_handle; + int rc; xc_machine_numa_layout_t *phys_layout; xc_domain_numa_layout_t *pv_layout; xc_dom_printf("%s: called\n", __FUNCTION__); xc_handle = dom->guest_xc; - phys_layout = &phys_numa_layout; - if (xc_read_machine_numa_layout(xc_handle, phys_layout)) + if (!(phys_layout = malloc(sizeof(*phys_layout))) || + !(pv_layout = malloc(sizeof(*pv_layout)))) { - xc_dom_printf( "%s: xc_read_machine_numa_layout failed\n", - __FUNCTION__); - return -1; - } - - /* Allocate pv_numa_layout dynamically for VMs */ - pv_layout = &pv_numa_layout; - memset(pv_layout, 0, sizeof(*pv_layout)); - - if (xc_select_domain_vnodes(dom, phys_layout, pv_layout)) - { - xc_dom_printf("%s: xc_select_domain_vnodes failed\n", __FUNCTION__); + xc_dom_printf( "%s: layout allocation failed\n", __FUNCTION__); return -1; } + if ((rc = xc_get_machine_numa_layout(xc_handle, phys_layout))) + { + xc_dom_printf( "%s: xc_get_machine_numa_layout failed\n", + __FUNCTION__); + goto done; + } + + if ((rc = xc_select_domain_vnodes(dom, phys_layout, pv_layout))) + { + xc_dom_printf("%s: xc_select_domain_vnodes failed\n", __FUNCTION__); + free(pv_layout); + goto done; + } + + dom->numa_layout 
= pv_layout; dump_guest_numa_layout(dom, pv_layout); - /* pv_layout is used only temporarily - XENMEM_numa_op to set the - * numa_layout for the domain */ +done: + free(phys_layout); + return rc; +} + +static int +xc_domain_numa_pinvcpus_split(struct xc_dom_image *dom, + struct xen_domain_numa_layout *pv_layout, + xc_machine_numa_layout_t *phys_layout) +{ + int vnode; + + for (vnode = 0; vnode < pv_layout->max_vnodes; vnode++) + { + int vcpu; + int mnode = pv_layout->vnode_data[vnode].mnode_id; + struct xenctl_cpumask *node_cpumask = + &phys_layout->node_data[mnode].cpu_mask; + struct xenctl_cpumask *vnode_vcpumask = + &pv_layout->vnode_data[mnode].vcpu_mask; + struct xenctl_cpumap node_cpumap, vnode_vcpumap; + + xc_cpumap_from_cpumask(&node_cpumap, node_cpumask); + xc_cpumap_from_cpumask(&vnode_vcpumap, vnode_vcpumask); + xc_for_each_cpu(vcpu, vnode_vcpumap) + { + if (xc_vcpu_setaffinity( + dom->guest_xc, dom->guest_domid, vcpu, &node_cpumap)) + { + xc_dom_printf( "%s:xc_vcpu_getaffinity failed\n", __FUNCTION__); + return -1; + } + } + } return 0; } +static int +xc_domain_numa_pinvcpus_striped(struct xc_dom_image *dom, + struct xen_domain_numa_layout *pv_layout, + xc_machine_numa_layout_t *phys_layout) +{ + int vnode, vcpu; + struct xenctl_cpumask stripe_cpumask; + struct xenctl_cpumap stripe_cpumap; + + xc_cpumap_from_cpumask(&stripe_cpumap, &stripe_cpumask); + xc_cpumap_clearall(stripe_cpumap); + + for (vnode = 0; vnode < pv_layout->max_vnodes; vnode++) + { + int mnode = pv_layout->vnode_data[vnode].mnode_id; + struct xenctl_cpumask *node_cpumask = + &phys_layout->node_data[mnode].cpu_mask; + struct xenctl_cpumap node_cpumap; + + xc_cpumap_from_cpumask(&node_cpumap, node_cpumask); + xc_cpumap_or(stripe_cpumap, stripe_cpumap, node_cpumap); + } + + for (vcpu = 0; vcpu < pv_layout->max_vcpus; vcpu++) + { + if (xc_vcpu_setaffinity( + dom->guest_xc, dom->guest_domid, vcpu, &stripe_cpumap)) + { + xc_dom_printf( "%s:xc_vcpu_getaffinity failed\n", __FUNCTION__); + 
return -1; + } + } + return 0; +} + +int xc_domain_numa_pinvcpus(struct xc_dom_image *dom, + struct xen_domain_numa_layout *pv_layout) +{ + int rc; + + xc_machine_numa_layout_t *phys_layout; + if (!(phys_layout = malloc(sizeof(*phys_layout)))) + { + xc_dom_printf( "%s: layout allocation failed\n", __FUNCTION__); + return -1; + } + + if ((rc = xc_get_machine_numa_layout(dom->guest_xc, phys_layout))) + { + xc_dom_printf( "%s: xc_get_machine_numa_layout failed\n", + __FUNCTION__); + goto done; + } + + if (pv_layout->type == XEN_DOM_NUMA_STRIPED) + rc = xc_domain_numa_pinvcpus_striped(dom, pv_layout, phys_layout); + else + rc = xc_domain_numa_pinvcpus_split(dom, pv_layout, phys_layout); +done: + free(phys_layout); + return rc; +} + + /*************************************************************************/ /* node lookup for mfns */ #define NUMA_NO_NODE 0xFF @@ -501,10 +671,7 @@ addr = pfn_to_paddr(mfn); if((addr >> memnode->shift) >= memnode->mapsize) - { - xc_dom_printf("(addr(%lx) >> memnode_shift) >= NODEMAPSIZE", addr); - return 0; - } + return -1; get_xen_guest_handle(memnode_map, memnode->map); nid = memnode_map[addr >> memnode->shift]; return nid; @@ -572,4 +739,3 @@ out: return rc; } - diff -r 3004092c4792 -r aa6ed694499d tools/libxc/xc_dom_numa.h --- a/tools/libxc/xc_dom_numa.h Thu Apr 01 16:23:27 2010 -0400 +++ b/tools/libxc/xc_dom_numa.h Sat Apr 03 02:02:05 2010 -0400 @@ -3,19 +3,31 @@ #define XC_MAX_NODES 8 -struct xc_machine_numa_layout { - uint64_t memsize; - uint64_t memfree; - - uint32_t max_nodes; - uint32_t node_distance[XC_MAX_NODES*XC_MAX_NODES]; - struct xenmem_node_data node_data[XC_MAX_NODES]; -}; -typedef struct xc_machine_numa_layout xc_machine_numa_layout_t; typedef struct xen_domain_numa_layout xc_domain_numa_layout_t; -extern int xc_setup_domain_numa_layout(struct xc_dom_image *dom); +extern int xc_setup_numa_domain(struct xc_dom_image *dom); +extern int xc_domain_numa_pinvcpus(struct xc_dom_image *dom, + struct xen_domain_numa_layout 
*pv_layout); + extern int xc_domain_nodemem_distribution(struct xc_dom_image *dom); +extern int __xc_domain_numa_layout(uint32_t cmd, int xc_handle, domid_t domid, + struct xen_domain_numa_layout *pv_layout); + + +static inline int xc_get_domain_numa_layout(struct xc_dom_image *dom, + struct xen_domain_numa_layout *layout) +{ + return __xc_domain_numa_layout( + XENMEM_get_domain_numa_layout, dom->guest_xc, dom->guest_domid, layout); +} + +static inline int xc_set_domain_numa_layout(struct xc_dom_image *dom, + struct xen_domain_numa_layout *layout) +{ + return __xc_domain_numa_layout( + XENMEM_set_domain_numa_layout, dom->guest_xc, dom->guest_domid, layout); +} + extern void dump_guest_numa_layout(struct xc_dom_image *dom, xc_domain_numa_layout_t *layout); #endif diff -r 3004092c4792 -r aa6ed694499d tools/libxc/xc_dom_x86.c --- a/tools/libxc/xc_dom_x86.c Thu Apr 01 16:23:27 2010 -0400 +++ b/tools/libxc/xc_dom_x86.c Sat Apr 03 02:02:05 2010 -0400 @@ -19,10 +19,12 @@ #include #include #include +#include #include "xg_private.h" #include "xc_dom.h" #include "xenctrl.h" +#include "xc_dom_numa.h" /* ------------------------------------------------------------------------ */ @@ -475,6 +477,17 @@ start_info->console.domU.mfn = xc_dom_p2m_guest(dom, dom->console_pfn); start_info->console.domU.evtchn = dom->console_evtchn; + if ( dom->numa_layout ) + { + start_info->flags |= SIF_NUMA_DOMAIN; + memset(dom->numa_layout, 0, sizeof(*dom->numa_layout)); + xc_dom_printf("%s: verifying domain numa_layout\n", __FUNCTION__); + xc_get_domain_numa_layout(dom, dom->numa_layout); + dump_guest_numa_layout(dom, dom->numa_layout); + free(dom->numa_layout); + dom->numa_layout = 0; + } + if ( dom->ramdisk_blob ) { start_info->mod_start = dom->ramdisk_seg.vstart; @@ -648,6 +661,7 @@ { "xen-3.0-x86_32p", 32 }, { "xen-3.0-x86_64", 64 }, }; + DECLARE_DOMCTL; int i,rc; @@ -694,11 +708,141 @@ return rc; } +extern void dump_guest_numa_layout(struct xc_dom_image *dom, + xc_domain_numa_layout_t 
*layout); +static int arch_setup_numa_striped(struct xc_dom_image *dom) +{ + int rc, vnode; + xen_pfn_t allocsz, i; + + struct xen_domain_numa_layout *dlayout = + malloc(sizeof(struct xen_domain_numa_layout)); + + memcpy(dlayout, dom->numa_layout, sizeof(*dlayout)); + + xc_dom_printf("%s: Striping memory in 4MB chunks\n", __FUNCTION__); + dump_guest_numa_layout(dom, dlayout); + /* allocate guest memory */ + for ( i = rc = allocsz = vnode = 0; + (i < dom->total_pages) && !rc; i += allocsz, vnode++ ) + { + int mnode; + while (!dlayout->vnode_data[vnode].nr_pages) + { + vnode++; + if (vnode >= dlayout->max_vnodes) + vnode = 0; + } + /* Allocate in 4MB stripes */ + allocsz = dom->total_pages - i; + if ( allocsz > 1024 ) + allocsz = 1024; + if (allocsz > dlayout->vnode_data[vnode].nr_pages) + allocsz = dlayout->vnode_data[vnode].nr_pages; + + dlayout->vnode_data[vnode].nr_pages -= allocsz; + mnode = dlayout->vnode_data[vnode].mnode_id; + + rc = xc_domain_memory_populate_physmap( + dom->guest_xc, dom->guest_domid, allocsz, + 0, XENMEMF_exact_node(mnode), + &dom->p2m_host[i]); + } + + free(dlayout); + xc_dom_printf("\n"); + if (rc) + xc_dom_printf("%s: Guest NUMA memory allocation failed \n", + __FUNCTION__); + return rc; +} + +static int arch_setup_numa_split(struct xc_dom_image *dom) +{ + int rc, vnode; + xen_pfn_t allocsz, i; + + struct xen_domain_numa_layout *dlayout; + xen_pfn_t prev_node_limit, cur_node_limit; + + dlayout = dom->numa_layout; + for (vnode = rc = prev_node_limit = cur_node_limit = 0; + (vnode < dlayout->max_vnodes) && !rc; vnode++) + { + struct xen_vnode_data *vnode_data = &dlayout->vnode_data[vnode]; + cur_node_limit = prev_node_limit + vnode_data->nr_pages; + /* allocate guest memory */ + xc_dom_printf("%s: Guest NUMA - node(%d):pages(%lu)\n", + __FUNCTION__, vnode, + (unsigned long)(cur_node_limit - prev_node_limit)); + + for ( i = allocsz = prev_node_limit; + (i < cur_node_limit) && !rc; i += allocsz ) + { + allocsz = cur_node_limit - i; + if ( 
allocsz > 1024*1024 ) + allocsz = 1024*1024; + rc = xc_domain_memory_populate_physmap( + dom->guest_xc, dom->guest_domid, allocsz, + 0, XENMEMF_exact_node(vnode_data->mnode_id), + &dom->p2m_host[i]); + } + prev_node_limit = cur_node_limit; + } + + if (rc) + xc_dom_printf("%s: Guest NUMA memory allocation failed \n", + __FUNCTION__); + + return rc; +} + +static inline int arch_setup_numa_meminit(struct xc_dom_image *dom) +{ + struct xen_domain_numa_layout *numa_layout; + xc_dom_printf("%s: x86_64 Guest NUMA mem setup - total_pages(%lu)\n", + __FUNCTION__, (unsigned long)dom->total_pages); + + numa_layout = dom->numa_layout; + if (numa_layout->type == XEN_DOM_NUMA_STRIPED) + return arch_setup_numa_striped(dom); + + return arch_setup_numa_split(dom); +} + +static inline int arch_setup_numa_domain(struct xc_dom_image *dom) +{ + int rc; + + if ((rc = arch_setup_numa_meminit(dom))) + { + xc_dom_printf("%s: arch_setup_numa_meminit failed\n", __FUNCTION__); + return rc; + } +#if 0 + xc_domain_nodemem_distribution(dom); +#endif + if ((rc = xc_domain_numa_pinvcpus(dom, dom->numa_layout))) + { + xc_dom_printf("%s: arch_setup_numa_pinvcpus failed\n", __FUNCTION__); + return rc; + } + if ((rc = xc_set_domain_numa_layout(dom, dom->numa_layout))) + { + xc_dom_printf("%s: arch_setup_numa_meminit failed\n", __FUNCTION__); + return rc; + } + + return rc; +} + int arch_setup_meminit(struct xc_dom_image *dom) { int rc; xen_pfn_t pfn, allocsz, i, j, mfn; + xc_dom_printf("%s: x86 mem setup\n", __FUNCTION__); + rc = x86_compat(dom->guest_xc, dom->guest_domid, dom->guest_type); if ( rc ) return rc; @@ -739,11 +883,21 @@ /* setup initial p2m */ for ( pfn = 0; pfn < dom->total_pages; pfn++ ) dom->p2m_host[pfn] = pfn; - + + if (dom->numa_layout) + { + if (!(rc=arch_setup_numa_domain(dom))) + { + xc_dom_printf("%s: arch_numa_meminit success\n", __FUNCTION__); + return rc; + } + xc_dom_printf("%s: arch_numa_meminit failed - reverting to \ + default memory allocation scheme\n", __FUNCTION__); + 
free(dom->numa_layout); + dom->numa_layout = 0; + } /* allocate guest memory */ - for ( i = rc = allocsz = 0; - (i < dom->total_pages) && !rc; - i += allocsz ) + for ( i=rc=allocsz=0; (i<dom->total_pages) && !rc; i+=allocsz ) { allocsz = dom->total_pages - i; if ( allocsz > 1024*1024 ) @@ -753,7 +907,6 @@ 0, 0, &dom->p2m_host[i]); } } - return rc; } diff -r 3004092c4792 -r aa6ed694499d xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Thu Apr 01 16:23:27 2010 -0400 +++ b/xen/arch/x86/mm.c Sat Apr 03 02:02:05 2010 -0400 @@ -4615,9 +4615,13 @@ return -EFAULT; } } - break; - } - + printk("XENMEM_memnode_map : %u, %d, %lu\n", + map->bufsize, memnode_shift, memnodemapsize); + + break; + } + + case XENMEM_get_domain_numa_layout: case XENMEM_set_domain_numa_layout: { struct xenmem_domain_numa_layout *layout; diff -r 3004092c4792 -r aa6ed694499d xen/include/public/memory.h --- a/xen/include/public/memory.h Thu Apr 01 16:23:27 2010 -0400 +++ b/xen/include/public/memory.h Sat Apr 03 02:02:05 2010 -0400 @@ -271,7 +271,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t); -/* Returns numa related info */ +/* xen guest numa operations */ #define XENMEM_numa_op 15 #define XEN_DOM_NUMA_INTERFACE_VERSION 0x00000001 @@ -353,6 +353,7 @@ struct xenmem_domain_numa_layout { domid_t domid; + uint32_t version; uint32_t bufsize; XEN_GUEST_HANDLE(void) buf; diff -r 3004092c4792 -r aa6ed694499d xen/include/public/xen.h --- a/xen/include/public/xen.h Thu Apr 01 16:23:27 2010 -0400 +++ b/xen/include/public/xen.h Sat Apr 03 02:02:05 2010 -0400 @@ -590,6 +590,7 @@ #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ #define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ +#define SIF_NUMA_DOMAIN (1<<3) /* Is the domain NUMA aware ? */ #define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ /*