vNUMA: Build domain NUMA layout for HVMs
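Thread a NUMA configuration (xc_domain_numa_config_t, from xc_dom_numa.h)
from the toolstack down into the HVM domain builder.
xc_hvm_build_target_mem() gains a numa_config parameter; libxl passes the
domain's configured value, and the Python binding passes NULL for now. When
a config is supplied, xc_hvm_build_internal() allocates a domain NUMA
layout, the builder populates each virtual node's memory from its assigned
machine node (per vnode, exactly unless the layout type is
XEN_DOM_NUMA_DONTCARE, or striped across nodes for XEN_DOM_NUMA_STRIPE),
and the vcpus are pinned to match via xc_domain_numa_pinvcpus(). Passing
NULL keeps the existing non-NUMA allocation path. PoD is turned off with
NUMA allocation for now.

A minimal caller sketch (illustrative only, not part of this patch; the
fields of xc_domain_numa_config_t are defined in xc_dom_numa.h and their
initialization is elided here):

    xc_domain_numa_config_t numa_config;

    /* ... fill numa_config from the guest's configuration ... */

    if ( xc_hvm_build_target_mem(xch, domid, memsize, target,
                                 &numa_config, image_name) != 0 )
        return -1;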
diff --git a/tools/libxc/ia64/xc_ia64_hvm_build.c b/tools/libxc/ia64/xc_ia64_hvm_build.c
--- a/tools/libxc/ia64/xc_ia64_hvm_build.c
+++ b/tools/libxc/ia64/xc_ia64_hvm_build.c
@@ -1110,6 +1110,7 @@ int xc_hvm_build_target_mem(xc_interface
                             uint32_t domid,
                             int memsize,
                             int target,
+                            xc_domain_numa_config_t *numa_config,
                             const char *image_name)
 {
     /* XXX:PoD isn't supported yet */
diff --git a/tools/libxc/xc_hvm_build.c b/tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c
+++ b/tools/libxc/xc_hvm_build.c
@@ -10,6 +10,7 @@
 
 #include "xg_private.h"
 #include "xc_private.h"
+#include "xc_dom_numa.h"
 
 #include 
 #include 
@@ -142,29 +143,16 @@ static long populate_physmap(xc_interfac
     return xc_memory_op(xch, XENMEM_populate_physmap, &reservation);
 }
 
-static int setup_guest_memory(xc_interface *xch, uint32_t dom,
-                              unsigned long nr_pages, unsigned long target_pages,
-                              struct elf_binary *elf)
+#define INVALID_NODE (~0)
+static int __setup_guest_memory(xc_interface *xch, uint32_t dom,
+                    unsigned long nr_pages, unsigned long target_pages,
+                    unsigned long cur_pages, xen_pfn_t *page_array,
+                    int vga_hole, int node, int exact_node)
 {
-    xen_pfn_t *page_array = NULL;
-    unsigned long pod_pages = 0, cur_pages, i;
     unsigned long stat_normal_pages = 0, stat_2mb_pages = 0, stat_1gb_pages = 0;
+    unsigned long pod_pages = 0;
+    unsigned int mem_flags = 0;
     int pod_mode = 0;
-
-    if ( nr_pages > target_pages )
-        pod_mode = 1;
-
-    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL )
-    {
-        PERROR("Could not allocate memory.");
-        goto error_out;
-    }
-
-    for ( i = 0; i < nr_pages; i++ )
-        page_array[i] = i;
-    for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < nr_pages; i++ )
-        page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
-
     /*
      * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
      *
@@ -175,13 +163,25 @@ static int setup_guest_memory(xc_interfa
      * Under 2MB mode, we allocate pages in batches of no more than 8MB to
      * ensure that we can be preempted and hence dom0 remains responsive.
      */
-    if (populate_physmap(xch, dom, 0xa0, 0, 0, page_array, 0x00) != 0xa0 )
+    if (nr_pages > target_pages)
     {
-        PERROR("Could not allocate memory.");
-        goto error_out;
+        pod_mode = 1;
+        mem_flags |= XENMEMF_populate_on_demand;
     }
-    cur_pages = 0xc0;
-    stat_normal_pages = 0xc0;
+    if (node != INVALID_NODE)
+        mem_flags |= exact_node?XENMEMF_exact_node(node):XENMEMF_node(node);
+
+    /* mem_flags (PoD + node placement) is passed to every populate_physmap()
+     * call below, so superpage allocations honour the requested node; the
+     * 4kB fallback masks the PoD flag out again because those extents are
+     * populated for real and charged against the PoD target. */
+
+    if (vga_hole)
+    {
+        if (populate_physmap(xch, dom, 0xa0, 0, mem_flags, page_array, 0x00)
+                != 0xa0)
+        {
+            PERROR("Could not allocate memory.");
+            goto error_out;
+        }
+        cur_pages = 0xc0;
+        stat_normal_pages = 0xc0;
+    }
 
 #define ALIGN_COUNT_TO_MAX_PAGES(count, cur_pages, max_pages) \
 do{ \
@@ -193,7 +193,6 @@ do{
       (count > max_pages) ) \
         count &= ~(max_pages-1); \
 }while(0)
-
     while ( nr_pages > cur_pages )
     {
         /* Clip count to maximum 1GB extent. */
@@ -203,42 +202,46 @@ do{
         if ( count > SUPERPAGE_1GB_NR_PFNS )
             count = SUPERPAGE_1GB_NR_PFNS;
 
-        /* Attempt to allocate 1GB super page. Because in each pass we only
-         * allocate at most 1GB, we don't have to clip super page boundaries.
+        /* Attempt to allocate 1GB super page. Because in each pass we
+         * allocate at most 1GB, we don't have to clip super page boundaries.
          */
         ALIGN_COUNT_TO_MAX_PAGES(count, cur_pages, SUPERPAGE_1GB_NR_PFNS);
         if ( ((count | cur_pages) & (SUPERPAGE_1GB_NR_PFNS - 1)) == 0 &&
-            /* Check if there exists MMIO hole in the 1GB memory range */
-            !check_mmio_hole(cur_pages << PAGE_SHIFT,
-                             SUPERPAGE_1GB_NR_PFNS << PAGE_SHIFT) )
+             /* Check if there exists MMIO hole in the 1GB memory range */
+             !check_mmio_hole(cur_pages << PAGE_SHIFT,
+                              SUPERPAGE_1GB_NR_PFNS << PAGE_SHIFT) )
         {
             done = populate_physmap(xch, dom, count, SUPERPAGE_1GB_SHIFT,
-                                    (pod_mode)?XENMEMF_populate_on_demand:0,
-                                    page_array, cur_pages);
-            stat_1gb_pages += done;
-            done <<= SUPERPAGE_1GB_SHIFT;
-            if ( pod_mode && target_pages > cur_pages )
+                        mem_flags, page_array, cur_pages);
+            if ( done > 0 )
             {
-                int d = target_pages - cur_pages;
-                pod_pages += ( done < d ) ? done : d;
+                stat_1gb_pages += done;
+                done <<= SUPERPAGE_1GB_SHIFT;
+                if ( pod_mode && target_pages > cur_pages )
+                {
+                    int d = target_pages - cur_pages;
+                    pod_pages += ( done < d ) ? done : d;
+                }
+                cur_pages += done;
+                count -= done;
             }
-            cur_pages += done;
-            count -= done;
         }
 
-        if ( count != 0 )
+        if ( count == 0 )
+            continue;
+
+        /* Clip count to maximum 8MB extent. */
+        if ( count > SUPERPAGE_2MB_NR_PFNS*4 )
+            count = SUPERPAGE_2MB_NR_PFNS*4;
+
+        /* Attempt to allocate superpage extents. */
+        ALIGN_COUNT_TO_MAX_PAGES(count, cur_pages, SUPERPAGE_2MB_NR_PFNS);
+        if ( ((count | cur_pages) & (SUPERPAGE_2MB_NR_PFNS - 1)) == 0 )
         {
-            /* Clip count to maximum 8MB extent. */
-            if ( count > SUPERPAGE_2MB_NR_PFNS*4 )
-                count = SUPERPAGE_2MB_NR_PFNS*4;
-
-            /* Attempt to allocate superpage extents. */
-            ALIGN_COUNT_TO_MAX_PAGES(count, cur_pages, SUPERPAGE_2MB_NR_PFNS);
-            if ( ((count | cur_pages) & (SUPERPAGE_2MB_NR_PFNS - 1)) == 0 )
+            done = populate_physmap(xch, dom, count, SUPERPAGE_2MB_SHIFT,
+                        mem_flags, page_array, cur_pages);
+            if ( done > 0 )
             {
-                done = populate_physmap(xch, dom, count, SUPERPAGE_2MB_SHIFT,
-                                        (pod_mode)?XENMEMF_populate_on_demand:0,
-                                        page_array, cur_pages);
                 stat_2mb_pages += done;
                 done <<= SUPERPAGE_2MB_SHIFT;
                 if ( pod_mode && target_pages > cur_pages )
@@ -251,49 +254,158 @@ do{
             }
         }
 
+        if ( count == 0 )
+            continue;
+
         /* Fall back to 4kB extents.
          */
-        if ( count != 0 )
+        done = populate_physmap(xch, dom, count, 0,
+                    mem_flags & ~XENMEMF_populate_on_demand,
+                    page_array, cur_pages);
+        if ( done != count )
         {
-            done = populate_physmap(xch, dom, count, 0, 0,
-                                    page_array, cur_pages);
-            if ( done != count )
-            {
-                PERROR("Could not allocate memory for HVM guest.");
-                goto error_out;
-            }
-            stat_normal_pages += count;
-            cur_pages += count;
+            PERROR("Could not allocate memory for HVM guest.");
             if ( pod_mode )
-                pod_pages -= count;
+                break;
+            goto error_out;
         }
+        stat_normal_pages += count;
+        cur_pages += count;
+        if ( pod_mode )
+            pod_pages -= count;
     }
 
 #undef ALIGN_COUNT_TO_MAX_PAGES
-
     if ( pod_mode )
     {
         if ( xc_domain_memory_set_pod_target(xch, dom, pod_pages,
-                                             NULL, NULL, NULL) )
+                        NULL, NULL, NULL) )
         {
             PERROR("Could not set POD target for HVM guest.");
             goto error_out;
         }
     }
 
-    IPRINTF("PHYSICAL MEMORY ALLOCATION:\n"
+    IPRINTF("PHYSICAL MEMORY ALLOCATION (NODE %d):\n"
             "  4KB PAGES: 0x%016lx\n"
             "  2MB PAGES: 0x%016lx\n"
             "  1GB PAGES: 0x%016lx\n",
-            stat_normal_pages, stat_2mb_pages, stat_1gb_pages);
-
-    if ( loadelfimage(xch, elf, dom, page_array) )
-        goto error_out;
-    free(page_array);
+            node, stat_normal_pages, stat_2mb_pages, stat_1gb_pages);
     return 0;
+error_out:
+    return -1;
+}
 
-error_out:
+static int setup_guest_numa_stripe(xc_interface *xch,
+                xc_domain_numa_layout_t *dom_layout, xen_pfn_t *page_array)
+{
+    int vnode, rc = 0;
+    unsigned long cur_pages, nr_pages;
+    /* Make a private copy for stripe iterations */
+    xc_domain_numa_layout_t *layout;
+
+    if (!(layout = malloc(sizeof(*layout))))
+    {
+        PERROR("%s: malloc failed.", __FUNCTION__);
+        return -1;
+    }
+    memcpy(layout, dom_layout, sizeof(*layout));
+
+    for (vnode=0, cur_pages=0, nr_pages=0;
+         cur_pages < dom_layout->nr_pages && !rc; vnode++)
+    {
+        unsigned long allocsz;
+        xc_vnode_data_t *vnode_data;
+
+        /* Wrap around before indexing, then skip exhausted vnodes. */
+        if (vnode >= layout->nr_vnodes)
+            vnode = 0;
+        while (!layout->vnode_data[vnode].nr_pages)
+        {
+            vnode++;
+            if (vnode >= layout->nr_vnodes)
+                vnode = 0;
+        }
+        vnode_data = &layout->vnode_data[vnode];
+
+        allocsz = layout->stripe_size;
+        if (allocsz > vnode_data->nr_pages)
+            allocsz = vnode_data->nr_pages;
+
+        nr_pages = cur_pages + allocsz;
+        rc = __setup_guest_memory(xch, layout->domid, nr_pages, nr_pages,
+                cur_pages, page_array, !cur_pages, vnode_data->mnode_id, 1);
+        vnode_data->nr_pages -= allocsz;
+        cur_pages = nr_pages;
+    }
+    free(layout);
+    return rc;
+}
+
+static int setup_guest_numa_memory(xc_interface *xch,
+                xc_domain_numa_layout_t *dom_layout, xen_pfn_t *page_array)
+{
+    int vnode, rc;
+    unsigned long cur_pages, nr_pages;
+
+    if ((rc = xc_setup_numa_domain(xch, dom_layout)))
+        goto setup_done;
+
+    if (dom_layout->type == XEN_DOM_NUMA_STRIPE)
+    {
+        rc = setup_guest_numa_stripe(xch, dom_layout, page_array);
+        goto setup_done;
+    }
+
+    /* XXX: pod is turned off with NUMA allocation for now */
+    for (vnode=0, cur_pages=0, nr_pages=0;
+         vnode < dom_layout->nr_vnodes && !rc; vnode++)
+    {
+        xc_vnode_data_t *vnode_data = &dom_layout->vnode_data[vnode];
+
+        nr_pages = cur_pages + vnode_data->nr_pages;
+        rc = __setup_guest_memory(xch, dom_layout->domid, nr_pages, nr_pages,
+                cur_pages, page_array, (vnode == 0), vnode_data->mnode_id,
+                (dom_layout->type != XEN_DOM_NUMA_DONTCARE));
+        cur_pages = nr_pages;
+    }
+setup_done:
+    if (!rc)
+        rc = xc_domain_numa_pinvcpus(xch, dom_layout);
+    return rc;
+}
+
+static int setup_guest_nonnuma_memory(xc_interface *xch, uint32_t domid,
+                unsigned long nr_pages, unsigned long target_pages,
+                xen_pfn_t *page_array)
+{
+    return __setup_guest_memory(xch, domid, nr_pages, target_pages, 0,
+                page_array, 1, INVALID_NODE, 0);
+}
+
+static int setup_guest_memory(xc_interface *xch, uint32_t dom,
+                    xc_domain_numa_layout_t *dom_layout,
+                    unsigned long nr_pages, unsigned long target_pages,
+                    struct elf_binary *elf)
+{
+    xen_pfn_t *page_array = NULL;
+    unsigned long i;
+    int rc;
+
+    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL )
+    {
+        rc = -1;
+        PERROR("Could not allocate memory.");
+        goto out;
+    }
+
+    for ( i = 0; i < nr_pages; i++ )
+        page_array[i] = i;
+    for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < nr_pages; i++ )
+        page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
+
+    if ( dom_layout )
+        rc = setup_guest_numa_memory(xch, dom_layout, page_array);
+    else
+        rc = setup_guest_nonnuma_memory(xch, dom,
+                    nr_pages, target_pages, page_array);
+    if ( rc )
+        goto out;
+
+    rc = loadelfimage(xch, elf, dom, page_array);
+out:
     if ( page_array )
         free(page_array);
-    return -1;
+    return rc;
 }
 
 static int
@@ -352,6 +464,7 @@ error_out:
 static int setup_guest(xc_interface *xch,
                        uint32_t dom, int memsize, int target,
+                       xc_domain_numa_layout_t *dom_layout,
                        char *image, unsigned long image_size)
 {
     unsigned long entry_eip;
@@ -390,7 +503,7 @@ static int setup_guest(xc_interface *xch
            v_start, v_end,
            elf_uval(&elf, elf.ehdr, e_entry));
 
-    rc = setup_guest_memory(xch, dom,
+    rc = setup_guest_memory(xch, dom, dom_layout,
                 (unsigned long)memsize << (20 - PAGE_SHIFT),
                 (unsigned long)target << (20 - PAGE_SHIFT), &elf);
     if ( rc < 0 )
@@ -423,16 +536,26 @@ static int xc_hvm_build_internal(xc_inte
                                  uint32_t domid,
                                  int memsize,
                                  int target,
+                                 xc_domain_numa_config_t *numa_config,
                                  char *image,
                                  unsigned long image_size)
 {
+    int rc;
+    xc_domain_numa_layout_t *dom_layout = NULL;
+
     if ( (image == NULL) || (image_size == 0) )
     {
         ERROR("Image required");
         return -1;
     }
-
-    return setup_guest(xch, domid, memsize, target, image, image_size);
+
+    if ( numa_config )
+        dom_layout = xc_dom_alloc_numa_layout(xch, domid,
+                        (uint64_t)memsize << (20 - PAGE_SHIFT), numa_config);
+
+    rc = setup_guest(xch, domid, memsize, target, dom_layout,
+                     image, image_size);
+
+    if ( dom_layout )
+        xc_dom_free_numa_layout(xch, dom_layout);
+    return rc;
 }
 
 /* xc_hvm_build:
@@ -450,11 +573,9 @@ int xc_hvm_build(xc_interface *xch,
     if ( (image_name == NULL) ||
          ((image = xc_read_image(xch, image_name, &image_size)) == NULL) )
         return -1;
-
-    sts = xc_hvm_build_internal(xch, domid, memsize, memsize, image, image_size);
-
+    sts = xc_hvm_build_internal(xch, domid, memsize, memsize, NULL,
+                                image, image_size);
     free(image);
-
     return sts;
 }
 
@@ -468,6 +589,7 @@ int xc_hvm_build_target_mem(xc_interface
                            uint32_t domid,
                            int memsize,
                            int target,
+                           xc_domain_numa_config_t *numa_config,
                            const char *image_name)
 {
     char *image;
@@ -477,11 +599,9 @@ int xc_hvm_build_target_mem(xc_interface
     if ( (image_name == NULL) ||
          ((image = xc_read_image(xch, image_name, &image_size)) == NULL) )
         return -1;
-
-    sts = xc_hvm_build_internal(xch, domid, memsize, target, image, image_size);
-
+    sts = xc_hvm_build_internal(xch, domid, memsize, target, numa_config,
+                                image, image_size);
     free(image);
-
     return sts;
 }
 
@@ -499,29 +619,23 @@ int xc_hvm_build_mem(xc_interface *xch,
     char *img;
 
     /* Validate that there is a kernel buffer */
-
     if ( (image_buffer == NULL) || (image_size == 0) )
     {
         ERROR("kernel image buffer not present");
         return -1;
     }
-
     img = xc_inflate_buffer(xch, image_buffer, image_size, &img_len);
     if ( img == NULL )
     {
         ERROR("unable to inflate ram disk buffer");
         return -1;
     }
-
-    sts = xc_hvm_build_internal(xch, domid, memsize, memsize,
-                                img, img_len);
-
+    sts = xc_hvm_build_internal(xch, domid, memsize, memsize, NULL,
+                                img, img_len);
     /* xc_inflate_buffer may return the original buffer pointer (for
        for already inflated buffers), so exercise some care in freeing */
-
     if ( (img != NULL) && (img != image_buffer) )
         free(img);
-
     return sts;
 }
diff --git a/tools/libxc/xenguest.h b/tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h
+++ b/tools/libxc/xenguest.h
@@ -149,6 +149,7 @@ int xc_hvm_build_target_mem(xc_interface
                             uint32_t domid,
                             int memsize,
                             int target,
+                            xc_domain_numa_config_t *numa_config,
                             const char *image_name);
 
 int xc_hvm_build_mem(xc_interface *xch,
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -230,6 +230,7 @@ int build_hvm(libxl_ctx *ctx, uint32_t d
                 domid,
                 (info->max_memkb - info->video_memkb) / 1024,
                 (info->target_memkb - info->video_memkb) / 1024,
+                &info->numa_config,
                 libxl_abs_path(ctx, (char *)info->kernel.path,
                                libxl_xenfirmwaredir_path()));
     if (ret) {
diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c
+++ b/tools/python/xen/lowlevel/xc/xc.c
@@ -997,7 +997,7 @@ static PyObject *pyxc_hvm_build(XcObject
         target = memsize;
 
     if ( xc_hvm_build_target_mem(self->xc_handle, dom, memsize,
-                                 target, image) != 0 )
+                                 target, NULL, image) != 0 )
        return pyxc_error_to_exception(self->xc_handle);
 
 #if !defined(__ia64__)
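For callers, the visible API change is the extra numa_config argument on
the builder entry points. The updated prototype from
tools/libxc/xenguest.h is repeated below for reference; passing NULL, as
xc_hvm_build(), xc_hvm_build_mem() and pyxc_hvm_build() do above, keeps
the old non-NUMA behaviour:

    int xc_hvm_build_target_mem(xc_interface *xch,
                                uint32_t domid,
                                int memsize,    /* maximum memory, in MB */
                                int target,     /* target memory, in MB */
                                xc_domain_numa_config_t *numa_config, /* or NULL */
                                const char *image_name);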