[Xen-devel] [PATCH 3/4] hvm: NUMA guest: allocate memory and pin

This patch introduces a new config file option called guestnodes.
Setting it to 0 (the default) keeps the current behavior. For any other
value, a set of suitable host nodes (those that have enough memory and
are the least used) is selected, and memory allocation is split evenly
across these nodes. CPU affinity is set accordingly.

Signed-off-by: Andre Przywara <andre.przywara@xxxxxxx>

Regards,
Andre

--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy

# HG changeset patch
# User Andre Przywara <andre.przywara@xxxxxxx>
# Date 1215083831 -7200
# Node ID b84c5f2fe83bd7c94ed956ba412689e614177f5c
# Parent  a0dccef499b005ba13eb70bf6cac856af44a10a0
make guest memory allocation NUMA aware

diff -r a0dccef499b0 -r b84c5f2fe83b tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c        Thu Jul 03 13:04:01 2008 +0200
+++ b/tools/libxc/xc_hvm_build.c        Thu Jul 03 13:17:11 2008 +0200
@@ -18,6 +18,8 @@
 #include "xc_e820.h"
 
 #include <xen/libelf.h>
+
+#include <asm/bitops.h>
 
 #define SUPERPAGE_PFN_SHIFT  9
 #define SUPERPAGE_NR_PFNS    (1UL << SUPERPAGE_PFN_SHIFT)
@@ -155,8 +157,171 @@
     return rc;
 }
 
+static int hweight_long (unsigned long value)
+{
+int ret=0;
+
+    while (value>0)
+    {
+        if (value&1) ++ret;
+        value>>=1;
+    }
+    return ret;
+}
+
+static int get_nodemasks (int xc_handle, uint64_t **nodemasks)
+{
+#define MAX_CPU_ID 255
+    xc_physinfo_t physinfo;
+    xc_cpu_to_node_t *cpumap;
+    int nrcpus, i;
+
+    cpumap=(xc_cpu_to_node_t *)malloc(sizeof(xc_cpu_to_node_t)*MAX_CPU_ID);
+    set_xen_guest_handle(physinfo.cpu_to_node, cpumap);
+
+    xc_physinfo (xc_handle,&physinfo);
+    nrcpus = physinfo.threads_per_core * physinfo.cores_per_socket *
+             physinfo.nr_nodes;
+
+    *nodemasks=malloc(sizeof(uint64_t)*physinfo.nr_nodes);
+    memset (*nodemasks,0,sizeof(uint64_t)*physinfo.nr_nodes);
+    for ( i = 0; i < nrcpus; i++ )
+    {
+        (*nodemasks)[cpumap[i]] |= 1 << i;
+    }
+    return nrcpus;
+}
+
+/* Distribute the VCPUs evenly across the given NUMA nodes.
+ * Use xc_vcpu_setaffinity to pin each VCPU to the physical CPUs of its node.
+ */
+static int setup_numa_affinity (int xc_handle, uint32_t dom,
+                                unsigned long nodemask)
+{
+    uint64_t *nodemasks, usemask;
+
+    int nrcpus, i;
+    xc_dominfo_t dominfo;
+    int nrnodes,curnode,vcpusleft;
+
+    nrnodes = hweight_long (nodemask);
+
+    nrcpus = get_nodemasks (xc_handle, &nodemasks);
+
+    if (xc_domain_getinfo (xc_handle, dom, 1, &dominfo) != 1)
+    {
+        ERROR("Unable to get platform info.");
+        return -1;
+    }
+    curnode = -1;
+    vcpusleft = 0;
+    for ( i = 0; i <= dominfo.max_vcpu_id; i++ )
+    {
+        if ( vcpusleft == 0 )
+        {
+            vcpusleft = ( dominfo.max_vcpu_id + 1 ) / nrnodes;
+            if ( ++curnode < ( ( dominfo.max_vcpu_id + 1 ) % nrnodes ) )
+                vcpusleft++;
+            usemask = nodemasks[__ffs(nodemask)];
+                   nodemask &= ~(1ULL<<__ffs(nodemask));
+        }
+        xc_vcpu_setaffinity (xc_handle, dom, i, usemask);
+        vcpusleft--;
+    }
+
+    return 0;
+}
+
+static int populate_on_node ( int xc_handle, uint32_t dom,
+                              unsigned long *cur_pages,
+                              unsigned long nr_pages,
+                              int node, xen_pfn_t* page_array)
+{
+int rc=0;
+unsigned long i;
+
+    while ( (rc == 0) && (nr_pages > 0 ) )
+    {
+        /* Clip count to maximum 8MB extent. */
+        unsigned long count = nr_pages;
+        if ( count > 2048 )
+            count = 2048;
+
+        /* Clip partial superpage extents to superpage boundaries. */
+        if ( ((*cur_pages & (SUPERPAGE_NR_PFNS-1)) != 0) &&
+             (count > (-*cur_pages & (SUPERPAGE_NR_PFNS-1))) )
+            count = -*cur_pages & (SUPERPAGE_NR_PFNS-1); /* clip s.p. tail */
+        else if ( ((count & (SUPERPAGE_NR_PFNS-1)) != 0) &&
+                  (count > SUPERPAGE_NR_PFNS) )
+            count &= ~(SUPERPAGE_NR_PFNS - 1); /* clip non-s.p. tail */
+
+        /* Attempt to allocate superpage extents. */
+        if ( ((count | *cur_pages) & (SUPERPAGE_NR_PFNS - 1)) == 0 )
+        {
+            long done;
+            xen_pfn_t sp_extents[2048 >> SUPERPAGE_PFN_SHIFT];
+            struct xen_memory_reservation sp_req = {
+                .nr_extents   = count >> SUPERPAGE_PFN_SHIFT,
+                .extent_order = SUPERPAGE_PFN_SHIFT,
+                .mem_flags    = XENMEM_set_node(node),
+                .domid        = dom
+            };
+            set_xen_guest_handle(sp_req.extent_start, sp_extents);
+            for ( i = 0; i < sp_req.nr_extents; i++ )
+                sp_extents[i] = 
page_array[*cur_pages+(i<<SUPERPAGE_PFN_SHIFT)];
+            done = xc_memory_op(xc_handle, XENMEM_populate_physmap, &sp_req);
+            if ( done > 0 )
+            {
+                done <<= SUPERPAGE_PFN_SHIFT;
+                *cur_pages += done;
+                count -= done;
+                nr_pages -= done;
+            }
+        }
+
+        /* Fall back to 4kB extents. */
+        if ( count != 0 )
+        {
+            rc = xc_domain_memory_populate_physmap(
+                xc_handle, dom, count, 0, 0, node, &page_array[*cur_pages]);
+            *cur_pages += count;
+            nr_pages -= count;
+        }
+    }
+    return rc;
+}
+
+static int setup_numa_mem ( int xc_handle, uint32_t dom,
+                            unsigned long *cur_pages, unsigned long nr_pages,
+                            unsigned nodemask, xen_pfn_t *page_array)
+{
+    int i, rc;
+    unsigned long cur_node_pages;
+    unsigned long pages_per_node;
+    int numanodes;
+
+    numanodes = hweight_long (nodemask);
+
+    pages_per_node = ((nr_pages+0xFF)&(~0xFFUL))/numanodes;
+
+    for ( i = 0 ; i < numanodes ; i++ )
+    {
+        if ( i == numanodes - 1 )
+            cur_node_pages = nr_pages - i * pages_per_node;
+        else cur_node_pages = pages_per_node;
+        if ( i == 0 ) cur_node_pages -= *cur_pages;
+
+        rc = populate_on_node (xc_handle, dom, cur_pages, cur_node_pages,
+                               __ffs(nodemask), page_array);
+        if ( rc != 0 ) return rc;
+
+        nodemask &= ~(1<<__ffs(nodemask));
+    }
+    return 0;
+}
+
 static int setup_guest(int xc_handle,
-                       uint32_t dom, int memsize,
+                       uint32_t dom, int memsize, unsigned long nodemask,
                        char *image, unsigned long image_size)
 {
     xen_pfn_t *page_array = NULL;
@@ -169,6 +334,7 @@
     struct elf_binary elf;
     uint64_t v_start, v_end;
     int rc;
+    int node;
     xen_capabilities_info_t caps;
 
     /* An HVM guest must be initialised with at least 2MB memory. */
@@ -217,60 +383,30 @@
      * We allocate pages in batches of no more than 8MB to ensure that
      * we can be preempted and hence dom0 remains responsive.
      */
+
+    if ( nodemask == 0 ) node = XENMEM_DEFAULT_NODE;
+        else node = __ffs (nodemask);
+
     rc = xc_domain_memory_populate_physmap(
-        xc_handle, dom, 0xa0, 0, 0, XENMEM_DEFAULT_NODE, &page_array[0x00]);
+        xc_handle, dom, 0xa0, 0, 0, node, &page_array[0x00]);
     cur_pages = 0xc0;
-    while ( (rc == 0) && (nr_pages > cur_pages) )
-    {
-        /* Clip count to maximum 8MB extent. */
-        unsigned long count = nr_pages - cur_pages;
-        if ( count > 2048 )
-            count = 2048;
 
-        /* Clip partial superpage extents to superpage boundaries. */
-        if ( ((cur_pages & (SUPERPAGE_NR_PFNS-1)) != 0) &&
-             (count > (-cur_pages & (SUPERPAGE_NR_PFNS-1))) )
-            count = -cur_pages & (SUPERPAGE_NR_PFNS-1); /* clip s.p. tail */
-        else if ( ((count & (SUPERPAGE_NR_PFNS-1)) != 0) &&
-                  (count > SUPERPAGE_NR_PFNS) )
-            count &= ~(SUPERPAGE_NR_PFNS - 1); /* clip non-s.p. tail */
-
-        /* Attempt to allocate superpage extents. */
-        if ( ((count | cur_pages) & (SUPERPAGE_NR_PFNS - 1)) == 0 )
-        {
-            long done;
-            xen_pfn_t sp_extents[2048 >> SUPERPAGE_PFN_SHIFT];
-            struct xen_memory_reservation sp_req = {
-                .nr_extents   = count >> SUPERPAGE_PFN_SHIFT,
-                .extent_order = SUPERPAGE_PFN_SHIFT,
-                .mem_flags     = XENMEM_set_node(XENMEM_DEFAULT_NODE),
-                .domid        = dom
-            };
-            set_xen_guest_handle(sp_req.extent_start, sp_extents);
-            for ( i = 0; i < sp_req.nr_extents; i++ )
-                sp_extents[i] = page_array[cur_pages+(i<<SUPERPAGE_PFN_SHIFT)];
-            done = xc_memory_op(xc_handle, XENMEM_populate_physmap, &sp_req);
-            if ( done > 0 )
-            {
-                done <<= SUPERPAGE_PFN_SHIFT;
-                cur_pages += done;
-                count -= done;
-            }
-        }
-
-        /* Fall back to 4kB extents. */
-        if ( count != 0 )
-        {
-            rc = xc_domain_memory_populate_physmap(
-                xc_handle, dom, count, 0, 0, XENMEM_DEFAULT_NODE, 
&page_array[cur_pages]);
-            cur_pages += count;
-        }
-    }
+    if ( hweight_long (nodemask) > 1 )
+        rc = setup_numa_mem (xc_handle, dom, &cur_pages, nr_pages,
+                             nodemask, page_array);
+    else
+        rc = populate_on_node (xc_handle, dom, &cur_pages, nr_pages - 
cur_pages,
+                               node, page_array);
 
     if ( rc != 0 )
     {
         PERROR("Could not allocate memory for HVM guest.\n");
         goto error_out;
+    }
+
+    if ( hweight_long (nodemask) > 1 )
+    {
+        setup_numa_affinity (xc_handle, dom, nodemask);
     }
 
     if ( loadelfimage(&elf, xc_handle, dom, page_array) != 0 )
@@ -365,6 +501,7 @@
 static int xc_hvm_build_internal(int xc_handle,
                                  uint32_t domid,
                                  int memsize,
+                                 unsigned long nodemask,
                                  char *image,
                                  unsigned long image_size)
 {
@@ -374,7 +511,7 @@
         return -1;
     }
 
-    return setup_guest(xc_handle, domid, memsize, image, image_size);
+    return setup_guest(xc_handle, domid, memsize, nodemask, image, image_size);
 }
 
 static inline int is_loadable_phdr(Elf32_Phdr *phdr)
@@ -389,6 +526,7 @@
 int xc_hvm_build(int xc_handle,
                  uint32_t domid,
                  int memsize,
+                 unsigned long nodemask,
                  const char *image_name)
 {
     char *image;
@@ -399,7 +537,8 @@
          ((image = xc_read_image(image_name, &image_size)) == NULL) )
         return -1;
 
-    sts = xc_hvm_build_internal(xc_handle, domid, memsize, image, image_size);
+    sts = xc_hvm_build_internal(xc_handle, domid, memsize, nodemask,
+                                image, image_size);
 
     free(image);
 
@@ -412,6 +551,7 @@
 int xc_hvm_build_mem(int xc_handle,
                      uint32_t domid,
                      int memsize,
+                     unsigned long nodemask,
                      const char *image_buffer,
                      unsigned long image_size)
 {
@@ -434,7 +574,7 @@
         return -1;
     }
 
-    sts = xc_hvm_build_internal(xc_handle, domid, memsize,
+    sts = xc_hvm_build_internal(xc_handle, domid, memsize, nodemask,
                                 img, img_len);
 
     /* xc_inflate_buffer may return the original buffer pointer (for
diff -r a0dccef499b0 -r b84c5f2fe83b tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h    Thu Jul 03 13:04:01 2008 +0200
+++ b/tools/libxc/xenguest.h    Thu Jul 03 13:17:11 2008 +0200
@@ -128,11 +128,13 @@
 int xc_hvm_build(int xc_handle,
                  uint32_t domid,
                  int memsize,
+                 unsigned long nodemask,
                  const char *image_name);
 
 int xc_hvm_build_mem(int xc_handle,
                      uint32_t domid,
                      int memsize,
+                     unsigned long nodemask,
                      const char *image_buffer,
                      unsigned long image_size);
 
diff -r a0dccef499b0 -r b84c5f2fe83b tools/libxc/xg_private.c
--- a/tools/libxc/xg_private.c  Thu Jul 03 13:04:01 2008 +0200
+++ b/tools/libxc/xg_private.c  Thu Jul 03 13:17:11 2008 +0200
@@ -177,6 +177,7 @@
     int xc_hvm_build(int xc_handle,
                      uint32_t domid,
                      int memsize,
+                     unsigned long nodemask,
                      const char *image_name)
 {
     errno = ENOSYS;
diff -r a0dccef499b0 -r b84c5f2fe83b tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Thu Jul 03 13:04:01 2008 +0200
+++ b/tools/python/xen/lowlevel/xc/xc.c Thu Jul 03 13:17:11 2008 +0200
@@ -857,16 +857,17 @@
 #endif
     char *image;
     int memsize, vcpus = 1, acpi = 0, apic = 1;
+    unsigned long nodemask;
 
     static char *kwd_list[] = { "domid",
                                 "memsize", "image", "vcpus", "acpi",
-                                "apic", NULL };
-    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iis|iii", kwd_list,
-                                      &dom, &memsize,
-                                      &image, &vcpus, &acpi, &apic) )
+                                "apic", "nodemask", NULL };
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iis|iiil", kwd_list,
+                                      &dom, &memsize, &image,
+                                      &vcpus, &acpi, &apic, &nodemask) )
         return NULL;
 
-    if ( xc_hvm_build(self->xc_handle, dom, memsize, image) != 0 )
+    if ( xc_hvm_build(self->xc_handle, dom, memsize, nodemask, image) != 0 )
         return pyxc_error_to_exception();
 
 #if !defined(__ia64__)
diff -r a0dccef499b0 -r b84c5f2fe83b tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py       Thu Jul 03 13:04:01 2008 +0200
+++ b/tools/python/xen/xend/XendConfig.py       Thu Jul 03 13:17:11 2008 +0200
@@ -162,6 +162,7 @@
     'vhpt': int,
     'guest_os_type': str,
     'hap': int,
+    'guestnodes': int,
 }
 
 # Xen API console 'other_config' keys.
@@ -374,6 +375,7 @@
             'other_config': {},
             'platform': {},
             'target': 0,
+            'guestnodes': 0,
         }
         
         return defaults
@@ -569,7 +571,10 @@
             cfg["memory"] = int(sxp.child_value(sxp_cfg, "memory"))
         if sxp.child_value(sxp_cfg, "maxmem") != None:
             cfg["maxmem"] = int(sxp.child_value(sxp_cfg, "maxmem"))
-            
+
+        if sxp.child_value(sxp_cfg, "guestnodes") != None:
+            cfg["guestnodes"] = int(sxp.child_value(sxp_cfg, "guestnodes"))
+
         # Convert scheduling parameters to vcpus_params
         if 'vcpus_params' not in cfg:
             cfg['vcpus_params'] = {}
diff -r a0dccef499b0 -r b84c5f2fe83b tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Thu Jul 03 13:04:01 2008 +0200
+++ b/tools/python/xen/xend/XendDomainInfo.py   Thu Jul 03 13:17:11 2008 +0200
@@ -2053,7 +2053,7 @@
                     if self.info['cpus'][v]:
                         xc.vcpu_setaffinity(self.domid, v, 
self.info['cpus'][v])
             else:
-                def find_relaxed_node(node_list):
+                def find_relaxed_node(node_list, numnodes):
                     import sys
                     nr_nodes = info['nr_nodes']
                     if node_list is None:
@@ -2078,21 +2078,36 @@
                             nodeload[i] = int(nodeload[i] * 16 / 
len(info['node_to_cpu'][i]))
                         else:
                             nodeload[i] = sys.maxint
-                    index = nodeload.index( min(nodeload) )    
-                    return index
+
+                    if numnodes == 0:
+                       return nodeload.index( min(nodeload) )
+                    else:
+                        nodemask = 0
+                        for i in range (0,numnodes):
+                            index = min((n, i) for i, n in 
enumerate(nodeload))[1]
+                            nodemask = nodemask | (1 << index)
+                            nodeload[index] = sys.maxint
+                        return nodemask
 
                 info = xc.physinfo()
+                nodemask = 0
                 if info['nr_nodes'] > 1:
                     node_memory_list = info['node_to_memory']
                     needmem = 
self.image.getRequiredAvailableMemory(self.info['memory_dynamic_max']) / 1024
+                    if self.image.guestnodes > 1:
+                       needmem = needmem / self.image.guestnodes
                     candidate_node_list = []
                     for i in range(0, info['nr_nodes']):
                         if node_memory_list[i] >= needmem and 
len(info['node_to_cpu'][i]) > 0:
                             candidate_node_list.append(i)
-                    index = find_relaxed_node(candidate_node_list)
-                    cpumask = info['node_to_cpu'][index]
-                    for v in range(0, self.info['VCPUs_max']):
-                        xc.vcpu_setaffinity(self.domid, v, cpumask)
+                    nodemask = find_relaxed_node(candidate_node_list, 
+                                                 self.image.guestnodes)
+                    if self.image.guestnodes < 1:
+                        cpumask = info['node_to_cpu'][nodemask]
+                        for v in range(0, self.info['VCPUs_max']):
+                            xc.vcpu_setaffinity(self.domid, v, cpumask)
+                    else:
+                        self.image.nodemask = nodemask
 
             # Use architecture- and image-specific calculations to determine
             # the various headrooms necessary, given the raw configured
diff -r a0dccef499b0 -r b84c5f2fe83b tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py    Thu Jul 03 13:04:01 2008 +0200
+++ b/tools/python/xen/xend/image.py    Thu Jul 03 13:17:11 2008 +0200
@@ -127,6 +127,9 @@
             self.cpuid = vmConfig['cpuid'];
         if 'cpuid_check' in vmConfig:
             self.cpuid_check = vmConfig['cpuid_check']
+
+        self.guestnodes = int(vmConfig['platform'].get('guestnodes',0))
+        self.nodemask   = 0
 
     def cleanupBootloading(self):
         if self.bootloader:
@@ -696,6 +699,7 @@
         self.apic = int(vmConfig['platform'].get('apic', 0))
         self.acpi = int(vmConfig['platform'].get('acpi', 0))
         self.guest_os_type = vmConfig['platform'].get('guest_os_type')
+        self.guestnodes = int(vmConfig['platform'].get('guestnodes', 0))
            
 
     # Return a list of cmd line args to the device models based on the
@@ -797,13 +801,16 @@
         log.debug("vcpus          = %d", self.vm.getVCpuCount())
         log.debug("acpi           = %d", self.acpi)
         log.debug("apic           = %d", self.apic)
+        log.debug("guestnodes     = %d", self.guestnodes)
+        log.debug("nodemask       = %d", self.nodemask)
 
         rc = xc.hvm_build(domid          = self.vm.getDomid(),
                           image          = self.loader,
                           memsize        = mem_mb,
                           vcpus          = self.vm.getVCpuCount(),
                           acpi           = self.acpi,
-                          apic           = self.apic)
+                          apic           = self.apic,
+                          nodemask       = self.nodemask)
         rc['notes'] = { 'SUSPEND_CANCEL': 1 }
 
         rc['store_mfn'] = xc.hvm_get_param(self.vm.getDomid(),
diff -r a0dccef499b0 -r b84c5f2fe83b tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py     Thu Jul 03 13:04:01 2008 +0200
+++ b/tools/python/xen/xm/create.py     Thu Jul 03 13:17:11 2008 +0200
@@ -557,6 +557,10 @@
           fn=append_value, default=[],
           use="""Cpuid check description.""")
 
+gopts.var('guestnodes', val="GUESTNODES",
+          fn=set_int, default=0,
+          use="""Number of NUMA nodes to appear in the guest.""")
+
 def err(msg):
     """Print an error to stderr and exit.
     """
@@ -765,7 +769,8 @@
              'vnc', 'vncdisplay', 'vncunused', 'vncconsole', 'vnclisten',
              'sdl', 'display', 'xauthority', 'rtc_timeoffset', 'monitor',
              'acpi', 'apic', 'usb', 'usbdevice', 'keymap', 'pci', 'hpet',
-             'guest_os_type', 'hap', 'opengl', 'cpuid', 'cpuid_check']
+             'guest_os_type', 'hap', 'opengl', 'cpuid', 'cpuid_check',
+             'guestnodes' ]
 
     for a in args:
         if a in vals.__dict__ and vals.__dict__[a] is not None:

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

WARNING - OLD ARCHIVES

xen-devel

[Xen-devel] [PATCH 3/4] hvm: NUMA guest: allocate memory and pin cpus ac