WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

Re: [Xen-devel] [PATCH v2] xl: add memory allocation logic for numa platform

To: "Zhang, Yang Z" <yang.z.zhang@xxxxxxxxx>
Subject: Re: [Xen-devel] [PATCH v2] xl: add memory allocation logic for numa platform
From: Juergen Gross <juergen.gross@xxxxxxxxxxxxxx>
Date: Tue, 09 Aug 2011 08:10:05 +0200
Cc: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>, "'Stefano Stabellini \(stefano.stabellini@xxxxxxxxxxxxx\)'" <stefano.stabellini@xxxxxxxxxxxxx>
Delivery-date: Mon, 08 Aug 2011 23:11:43 -0700
Dkim-signature: v=1; a=rsa-sha256; c=simple/simple; d=ts.fujitsu.com; i=juergen.gross@xxxxxxxxxxxxxx; q=dns/txt; s=s1536b; t=1312870208; x=1344406208; h=message-id:date:from:mime-version:to:cc:subject: references:in-reply-to:content-transfer-encoding; bh=Crhv9Jff+fHtm2MGBz39kWBFOZnhqGfXyyt0RLtO/gw=; b=ldfKzKhci7l5VkF3bDPAmHY3Iu7TlZP5TFW1wBGuERTvbQVVCm/dwzvx 9yp1Mml0T5frfBfQWRdCG0nIVX65roSi3sPa398FuJvO+CcAJ+PpkYFiv dopui0Fr8xMtsewRIcM+attn/H96eTug2fMuPgEIvJzo8crWwFnxvCWb4 msmluidwH2hnCSzqvDxCqViBEyXzyXZp+qNNlZYVUO1LlXEBJjjLsMBwi l4NJFWvMeHzzP00QMerJBuc2pqC4V;
Domainkey-signature: s=s1536a; d=ts.fujitsu.com; c=nofws; q=dns; h=X-SBRSScore:X-IronPort-AV:Received:X-IronPort-AV: Received:Received:Message-ID:Date:From:Organization: User-Agent:MIME-Version:To:CC:Subject:References: In-Reply-To:Content-Type:Content-Transfer-Encoding; b=DG5dIydSeY2VC51pnTGOCScjLKrGrt6myHhjI6tymwWBsRhICFMP5tsw UaboiScQOP1npYCad0ivAXfOmKoynFckg65ie3/xbFFnBW2ozUF9hc1ZN VeoHGAzWBpI+LKHMEXbzF/kFzXFc2L+2sdlI1695Q6hJQ7LoEHlTsWVUv 2KUm9EB0VBVhXM+0Eo1MTIhef6o+dl5HN0l6bLK9oHu+zooUZr/EQ+XFq Q6gwxNhfPqPuWEyDmzxV2hGKomIgK;
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
In-reply-to: <749B9D3DBF0F054390025D9EAFF47F2212D10A330D@xxxxxxxxxxxxxxxxxxxxxxxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Organization: Fujitsu Technology Solutions
References: <749B9D3DBF0F054390025D9EAFF47F2212D10A330D@xxxxxxxxxxxxxxxxxxxxxxxxxxxx>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) Gecko/20110626 Iceowl/1.0b2 Icedove/3.1.11
Hi again,

sorry, didn't spot it before: you should use the generic cpumap functions
(libxl_cpumap_alloc, libxl_cpumap_test, libxl_cpumap_set, libxl_cpumap_reset)
and the libxl_cpumap type for the cpumaps.
This will remove the little overkill of using an asm construct for setting
a bit, too.

On 08/09/2011 07:52 AM, Zhang, Yang Z wrote:
Thanks for Juergen's comments. Here is the revised patch, which adds a cpupool check.

On a NUMA platform, we need to allocate a guest's memory on the node where the
guest's CPUs reside.
This patch adds this feature to xl, using a simple algorithm to select the
best node.

Signed-off-by: Zhang Yang<yang.z.zhang@xxxxxxxxx>

diff -r 9aa47ef52e4d tools/libxl/libxl.c
--- a/tools/libxl/libxl.c       Mon Jul 04 06:08:05 2011 +0800
+++ b/tools/libxl/libxl.c       Fri Aug 12 13:51:41 2011 +0800
@@ -2259,6 +2259,108 @@
      return ERROR_FAIL;
  }

+static inline void set_bit(int nr, volatile void *addr)
+{
+    asm volatile (
+      "btsl %1,%0"
+       : "=m" (ADDR)
+       : "Ir" (nr), "m" (ADDR) : "memory");

This breaks other architectures (e.g. ia64).

+}
+
+int libxl_get_numainfo(libxl_ctx *ctx, libxl_numainfo_t *numainfo)
+{
+    xc_numainfo_t ninfo = { 0 };
+    libxl_physinfo physinfo = { 0 };
+    libxl_topologyinfo topoinfo;
+    int i,  max_nodes, max_cpus, node;
+    libxl_nodeinfo_t *nodeinfo;
+    DECLARE_HYPERCALL_BUFFER(xc_node_to_memsize_t, node_memsize);
+    DECLARE_HYPERCALL_BUFFER(xc_node_to_memfree_t, node_memfree);
+
+    if (libxl_get_physinfo(ctx,&physinfo))
+        goto out;
+
+    max_cpus = physinfo.max_cpu_id + 1;
+    max_nodes = NUMA_NO_NODE + 1;
+    numainfo->max_cpus = max_cpus;
+
+    numainfo->cpu_to_node = calloc(max_cpus, sizeof (unsigned long));
+    if (numainfo->cpu_to_node == NULL)
+        goto out;
+
+    numainfo->nodeinfo = (char *)calloc(max_nodes, sizeof(libxl_nodeinfo_t));
+    if (numainfo->nodeinfo == NULL)
+        goto out;
+
+    nodeinfo = (libxl_nodeinfo_t *)numainfo->nodeinfo;
+    node_memsize = xc_hypercall_buffer_alloc(ctx->xch, node_memsize, 
sizeof(*node_memsize) * max_nodes);
+    if ( node_memsize == NULL )
+        goto out;
+    node_memfree = xc_hypercall_buffer_alloc(ctx->xch, node_memfree, 
sizeof(*node_memfree) * max_nodes);
+    if ( node_memfree == NULL )
+        goto out;
+
+    set_xen_guest_handle(ninfo.node_to_memsize, node_memsize);
+    set_xen_guest_handle(ninfo.node_to_memfree, node_memfree);
+    ninfo.max_node_index = max_nodes - 1;
+
+    if ( xc_numainfo(ctx->xch,&ninfo) != 0 )
+        goto out;
+
+    max_nodes = ninfo.max_node_index + 1;
+    numainfo->max_nodes = max_nodes;
+
+    if (libxl_get_topologyinfo(ctx,&topoinfo))
+        goto out;
+
+    for ( i = 0; i<= max_nodes; i++ ) {
+        if (node_memsize[i] != INVALID_MEM_NODE) {
+            nodeinfo[i].online = 1;
+            nodeinfo[i].cpumap = malloc(BITS_TO_LONGS(max_cpus) * sizeof 
(unsigned long));
+            bzero(nodeinfo[i].cpumap, BITS_TO_LONGS(max_cpus) * sizeof 
(unsigned long));
+
+            /* Total Memory */
+            nodeinfo[i].total_memkb = node_memsize[i]>>  10; /* KB */
+
+            /* Free Memory */
+            nodeinfo[i].free_memkb = node_memfree[i]>>  10; /* KB */
+        } else
+            nodeinfo[i].online = 0;
+    }
+
+    for (i = 0; i<  max_cpus; i++)
+        if (topoinfo.coremap.array[i] != LIBXL_CPUARRAY_INVALID_ENTRY) {
+            node = topoinfo.nodemap.array[i];
+            set_bit(i, nodeinfo[node].cpumap);
+            numainfo->cpu_to_node[i] = node;
+        }
+    libxl_topologyinfo_destroy(&topoinfo);
+
+    xc_hypercall_buffer_free(ctx->xch, node_memsize);
+    xc_hypercall_buffer_free(ctx->xch, node_memfree);
+    return 0;
+
+out:
+    if (numainfo->cpu_to_node)
+        free(numainfo->cpu_to_node);
+    if (numainfo->nodeinfo);
+        free(numainfo->nodeinfo);
+    xc_hypercall_buffer_free(ctx->xch, node_memsize);
+    xc_hypercall_buffer_free(ctx->xch, node_memfree);
+    return ERROR_FAIL;
+}
+
+void libxl_free_numainfo(libxl_numainfo_t *numainfo)
+{
+    int i;
+    libxl_nodeinfo_t *nodeinfo = (libxl_nodeinfo_t *)numainfo->nodeinfo;
+
+    for(i = 0; i<  numainfo->max_nodes; i++)
+        if(nodeinfo[i].cpumap)
+            free(nodeinfo[i].cpumap);
+    free(numainfo->cpu_to_node);
+    free(numainfo->nodeinfo);
+}
  const libxl_version_info* libxl_get_version_info(libxl_ctx *ctx)
  {
      union {
diff -r 9aa47ef52e4d tools/libxl/libxl.h
--- a/tools/libxl/libxl.h       Mon Jul 04 06:08:05 2011 +0800
+++ b/tools/libxl/libxl.h       Fri Aug 12 13:51:41 2011 +0800
@@ -496,6 +496,16 @@

  int libxl_get_physinfo(libxl_ctx *ctx, libxl_physinfo *physinfo);
  int libxl_get_topologyinfo(libxl_ctx *ctx, libxl_topologyinfo *info);
+
+#define NUMA_NO_NODE    0xFF
+#define INVALID_MEM_NODE     0ul
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+#define ADDR (*(volatile long *) addr)
+
+int libxl_get_numainfo(libxl_ctx *ctx, libxl_numainfo_t *numainfo);
+void libxl_free_numainfo(libxl_numainfo_t *numainfo);
+
  libxl_vcpuinfo *libxl_list_vcpu(libxl_ctx *ctx, uint32_t domid,
                                         int *nb_vcpu, int *nrcpus);
  int libxl_set_vcpuaffinity(libxl_ctx *ctx, uint32_t domid, uint32_t vcpuid,
diff -r 9aa47ef52e4d tools/libxl/libxl.idl
--- a/tools/libxl/libxl.idl     Mon Jul 04 06:08:05 2011 +0800
+++ b/tools/libxl/libxl.idl     Fri Aug 12 13:51:41 2011 +0800
@@ -291,6 +291,19 @@
      ("socketmap", libxl_cpuarray, False, "cpu to socket map"),
      ("nodemap", libxl_cpuarray,   False, "cpu to node map"),
      ])
+libxl_nodeinfo = Struct("nodeinfo_t", [
+    ("free_memkb", uint32),
+    ("total_memkb", uint32),
+    ("candidate", uint32),
+    ("online", uint32),
+    ("cpumap", string),
+    ])
+libxl_numainfo = Struct("numainfo_t", [
+    ("nodeinfo", string),
+    ("max_nodes", uint32),
+    ("cpu_to_node", string),
+    ("max_cpus", uint32),
+    ])

  libxl_sched_credit = Struct("sched_credit", [
      ("weight", integer),
diff -r 9aa47ef52e4d tools/libxl/libxl_create.c
--- a/tools/libxl/libxl_create.c        Mon Jul 04 06:08:05 2011 +0800
+++ b/tools/libxl/libxl_create.c        Fri Aug 12 13:51:41 2011 +0800
@@ -143,17 +143,110 @@
          console->build_state = state;
      return 0;
  }
+static int find_best_node(libxl_ctx *ctx, libxl_numainfo_t *numainfo)
+{
+    int nr_doms, i, j, nr_vcpu, nrcpus, best_node, pcpu, node_id;
+    unsigned long max_nodes = numainfo->max_nodes;
+    unsigned long *nodeload;
+    libxl_dominfo *dominfo;
+    libxl_vcpuinfo *vcpuinfo;
+    libxl_nodeinfo_t *nodeinfo = (libxl_nodeinfo_t *)numainfo->nodeinfo;
+
+    nodeload = malloc(max_nodes * sizeof(*nodeload));
+    bzero(nodeload, max_nodes * sizeof(*nodeload));
+
+    if (!(dominfo = libxl_list_domain(ctx,&nr_doms)))
+        goto out;
+
+    for (i = 0; i<  nr_doms; i++) {
+        vcpuinfo = libxl_list_vcpu(ctx, dominfo[i].domid,&nr_vcpu,&nrcpus);
+        if (!vcpuinfo)
+            goto out;
+        for (j = 0; j<  nr_vcpu; j++) {
+            if (!vcpuinfo[j].online)
+                continue;
+            pcpu = vcpuinfo[j].cpu;
+            node_id = numainfo->cpu_to_node[pcpu];
+            if (nodeinfo[node_id].candidate)
+                nodeload[node_id]++;
+            else
+                nodeload[node_id] += 8;
+        }
+        free(vcpuinfo);
+    }
+    best_node = 0;
+    for (i = 1; i<  max_nodes; i++)
+        if(nodeinfo[i].candidate&&  nodeinfo[i].online
+&&  nodeload[i]<  nodeload[best_node])
+            best_node = i;
+
+    return best_node;
+out:
+    if (dominfo)
+        free(dominfo);
+    return -1;
+}
+
+static int libxl_node_select(libxl_ctx *ctx, libxl_domain_build_info *b_info, 
uint32_t domid)
+{
+    unsigned long i, best_node;
+    unsigned long needmem = b_info->max_memkb;
+    libxl_numainfo_t numainfo ={ 0 };
+    libxl_nodeinfo_t *nodeinfo;
+    libxl_cpupoolinfo *poolinfo;
+    int n_pools;
+
+    poolinfo = libxl_list_cpupool(ctx,&n_pools);
+    for (i = 0; i<  n_pools; i++)
+         libxl_cpupoolinfo_destroy(poolinfo + i);
+     if (n_pools>  1) {
+         printf("cpupools are being used - skip numa optimization.\n");
+         return 0;
+    }
+
+
+    if (libxl_get_numainfo(ctx,&numainfo)) {
+        fprintf(stderr, "libxl_get_topologyinfo failed.\n");
+        return -1;
+    }
+
+    if (numainfo.max_nodes<  2) {
+        printf("max_nodes = %d\n", numainfo.max_nodes);
+        return 0;
+        }
+
+    nodeinfo = (libxl_nodeinfo_t *)numainfo.nodeinfo;
+    for (i = 0; i<  numainfo.max_nodes; i++)
+        if (nodeinfo[i].free_memkb>  needmem)
+            nodeinfo[i].candidate = 1;
+
+    best_node = find_best_node(ctx,&numainfo);
+    if (best_node == -1) {
+        libxl_numainfo_t_destroy(&numainfo);
+        return -1;
+    }
+
+    for (i = 0; i<  b_info->max_vcpus; i++)
+        xc_vcpu_setaffinity(ctx->xch, domid, i, (uint8_t 
*)(nodeinfo[best_node].cpumap));
+
+    libxl_numainfo_t_destroy(&numainfo);
+    return 0;
+}

  int libxl__domain_build(libxl__gc *gc, libxl_domain_build_info *info, 
uint32_t domid, libxl_domain_build_state *state)
  {
      char **vments = NULL, **localents = NULL;
      struct timeval start_time;
      int i, ret;
+    libxl_ctx *ctx = libxl__gc_owner(gc);

      ret = libxl__build_pre(gc, domid, info, state);
      if (ret)
          goto out;

+    if (libxl_node_select(ctx, info, domid))
+        printf("Cannot find best node, using defaul algorithm\n");
+
      gettimeofday(&start_time, NULL);

      if (info->hvm) {

Juergen

--
Juergen Gross                 Principal Developer Operating Systems
PDG ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@xxxxxxxxxxxxxx
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

<Prev in Thread] Current Thread [Next in Thread>