[Xen-devel] [PATCH 4/6][RESEND] xen: Add NUMA support to Xen

To: xen-devel@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-devel] [PATCH 4/6][RESEND] xen: Add NUMA support to Xen
From: Ryan Harper <ryanh@xxxxxxxxxx>
Date: Fri, 12 May 2006 10:12:46 -0500
Cc: Ryan Grimm <grimm@xxxxxxxxxx>
Delivery-date: Fri, 12 May 2006 08:15:43 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
In-reply-to: <20060501215909.GX16776@xxxxxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
References: <20060501215909.GX16776@xxxxxxxxxx>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mutt/1.5.6+20040907i
* Ryan Harper <ryanh@xxxxxxxxxx> [2006-05-01 17:01]:
> This patch exports the NUMA-specific information collected by the
> hypervisor through the physinfo hypercall.  This additional information
> is also integrated into the xm info command, which displays
> the NUMA information.
> 
> nr_nodes is now calculated from num_online_nodes, rather than a
> hard-coded value of 1.
> 
> nr_nodes               : 2
> 
> We display the 64-bit address range of each memory chunk and the
> node to which it belongs.
> 
> mem_chunks             : node0:0x0000000000000000-0x000000000009ffff
>                          node0:0x0000000000100000-0x000000007fffffff
>                          node1:0x0000000080000000-0x00000000dfffffff
> 
> 
> We provide a node-to-cpu mapping as well.  The cpu value is a
> collapsed range; for example, on a two-node, 32-way system, the
> node_to_cpu value might look like:
> 
> node_to_cpu            : node0:0-15
>                          node1:16-31
>

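The collapsed node_to_cpu value above is just each node's cpu list folded
into contiguous runs.  A minimal standalone sketch of that formatting
(illustrative only; the actual helper the patch adds lives in XendNode.py
below):

def format_span(a, b):
    # a single cpu prints as "n", a run prints as "a-b"
    if a == b:
        return "%d" % a
    return "%d-%d" % (a, b)

def collapse(cpus):
    # fold a list of cpu ids into a "0-15" / "0-2,8" style string
    if not cpus:
        return "no cpus"
    cpus = sorted(cpus)
    spans = []
    start = prev = cpus[0]
    for c in cpus[1:]:
        if c != prev + 1:          # a gap closes the current run
            spans.append(format_span(start, prev))
            start = c
        prev = c
    spans.append(format_span(start, prev))
    return ",".join(spans)

# collapse(range(0, 16))  -> '0-15'
# collapse([0, 1, 2, 8])  -> '0-2,8'
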
Dropped CONFIG_NUMA ifdefs.
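
For reference, consuming the extended xc.physinfo() dictionary from the
patched python bindings could look roughly like the following.  The
dictionary keys ('nr_nodes', 'mem_chunks', 'node_to_cpu', etc.) match the
patch below; the module path and constructor usage are assumptions, not
part of the patch:

from xen.lowlevel.xc import xc

info = xc().physinfo()

print "nr_nodes : %d" % info['nr_nodes']

# each chunk is a dict with 'node', 'start_paddr' and 'end_paddr'
for chunk in info['mem_chunks']:
    print "node%d: 0x%016x-0x%016x" % (chunk['node'],
                                       chunk['start_paddr'],
                                       chunk['end_paddr'])

# node_to_cpu is a list, indexed by node, of lists of cpu ids
for node in range(info['nr_nodes']):
    print "node%d cpus: %s" % (node, info['node_to_cpu'][node])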

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@xxxxxxxxxx


diffstat output:
 b/xen/include/public/numa_structs.h                 |   26 +++++++
 tools/libxc/xc_misc.c                               |    3 
 tools/libxc/xenctrl.h                               |    3 
 tools/python/xen/lowlevel/xc/xc.c                   |   66 ++++++++++++++++++-
 tools/python/xen/xend/XendNode.py                   |   67 ++++++++++++++++++++
 tools/xm-test/tests/info/02_info_compiledata_pos.py |    4 -
 xen/arch/x86/dom0_ops.c                             |   43 ++++++++++++
 xen/include/public/arch-x86_32.h                    |    1 
 xen/include/public/arch-x86_64.h                    |    1 
 xen/include/public/dom0_ops.h                       |    4 +
 xen/include/xen/numa.h                              |    9 --
 11 files changed, 211 insertions(+), 16 deletions(-)

Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
Signed-off-by: Ryan Grimm <grimm@xxxxxxxxxx>
---
# HG changeset patch
# User Ryan Harper <ryanh@xxxxxxxxxx>
# Node ID b13712d6a6154d4610b894adab7f89ee7b3683d4
# Parent  b92d38d9be2808b73dd87e0f3d61858540dc8f69
This patch exports the NUMA-specific information collected by the
hypervisor through the physinfo hypercall.  This additional information
is also integrated into the xm info command, which displays
the NUMA information.

nr_nodes is now calculated from num_online_nodes, rather than a
hard-coded value of 1.

nr_nodes               : 2

We display the 64-bit address range of each memory chunk and the
node to which it belongs.

mem_chunks             : node0:0x0000000000000000-0x000000000009ffff
                         node0:0x0000000000100000-0x000000007fffffff
                         node1:0x0000000080000000-0x00000000dfffffff


We provide a node-to-cpu mapping as well.  The cpu value is a
collapsed range; for example, on a two-node, 32-way system, the
node_to_cpu value might look like:

node_to_cpu            : node0:0-15
                         node1:16-31

diff -r b92d38d9be28 -r b13712d6a615 tools/libxc/xc_misc.c
--- a/tools/libxc/xc_misc.c     Thu May 11 20:49:50 2006
+++ b/tools/libxc/xc_misc.c     Thu May 11 21:01:06 2006
@@ -40,6 +40,9 @@
 
     op.cmd = DOM0_PHYSINFO;
     op.interface_version = DOM0_INTERFACE_VERSION;
+    /* set pointers to caller's so memcpy doesn't clobber them */
+    op.u.physinfo.memory_chunks = put_info->memory_chunks;
+    op.u.physinfo.node_to_cpu = put_info->node_to_cpu;
 
     if ( (ret = do_dom0_op(xc_handle, &op)) != 0 )
         return ret;
diff -r b92d38d9be28 -r b13712d6a615 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Thu May 11 20:49:50 2006
+++ b/tools/libxc/xenctrl.h     Thu May 11 21:01:06 2006
@@ -20,6 +20,7 @@
 #include <xen/memory.h>
 #include <xen/acm.h>
 #include <xen/acm_ops.h>
+#include <xen/numa_structs.h>
 
 #ifdef __ia64__
 #define XC_PAGE_SHIFT           14
@@ -391,6 +392,8 @@
                        int clear);
 
 typedef dom0_physinfo_t xc_physinfo_t;
+typedef node_memory_chunk_t xc_memory_chunk_t;
+typedef uint64_t xc_node_to_cpu_t;
 int xc_physinfo(int xc_handle,
                 xc_physinfo_t *info);
 
diff -r b92d38d9be28 -r b13712d6a615 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Thu May 11 20:49:50 2006
+++ b/tools/python/xen/lowlevel/xc/xc.c Thu May 11 21:01:06 2006
@@ -603,8 +603,21 @@
 {
     xc_physinfo_t info;
     char cpu_cap[128], *p=cpu_cap, *q=cpu_cap;
-    int i;
-    
+    int i,j;
+    PyObject *ret_obj, *memchunk_obj, *node_to_cpu_obj;
+    xc_memory_chunk_t *chunks;
+    xc_node_to_cpu_t  *map;
+
+    /* make space for mem chunks */
+    chunks =  (xc_memory_chunk_t *)malloc( sizeof(xc_memory_chunk_t) * 
+                                     PUBLIC_MAXCHUNKS );
+    set_xen_guest_handle(info.memory_chunks, chunks);
+
+    /* make space for node_to_cpu mapping */
+    map = (xc_node_to_cpu_t *)malloc( sizeof(xc_node_to_cpu_t) *
+                                    PUBLIC_MAX_NUMNODES ); 
+    set_xen_guest_handle(info.node_to_cpu, map);
+
     if ( xc_physinfo(self->xc_handle, &info) != 0 )
         return PyErr_SetFromErrno(xc_error);
 
@@ -617,16 +630,59 @@
     }
     if(q>cpu_cap)
         *(q-1)=0;
-
-    return Py_BuildValue("{s:i,s:i,s:i,s:i,s:l,s:l,s:i,s:s}",
+    
+    ret_obj = Py_BuildValue("{s:i,s:i,s:i,s:l,s:l,s:i,s:s}",
                          "threads_per_core", info.threads_per_core,
                          "cores_per_socket", info.cores_per_socket,
                          "sockets_per_node", info.sockets_per_node,
-                         "nr_nodes",         info.nr_nodes,
                          "total_memory",     pages_to_mb(info.total_pages),
                          "free_memory",      pages_to_mb(info.free_pages),
                          "cpu_khz",          info.cpu_khz,
                          "hw_caps",          cpu_cap);
+     
+    /* memchunks */
+    memchunk_obj = PyList_New(0);
+ 
+    /* build list of each memchunk's attributes */
+    for ( i=0; i<info.nr_chunks; i++ ) 
+    {
+        PyList_Append(memchunk_obj, 
+                      Py_BuildValue("{s:i,s:K,s:K}",
+                      "node"       , chunks[i].nid,
+                      "start_paddr", chunks[i].start_paddr,
+                      "end_paddr"  , chunks[i].end_paddr));
+    }
+    /* add list of attributes and nr_chunks to physinfo dictionary */
+    PyDict_SetItemString(ret_obj, "mem_chunks", memchunk_obj);
+    PyDict_SetItemString(ret_obj, "nr_chunks", 
+             Py_BuildValue("i", info.nr_chunks));
+ 
+    /* node to cpu mappings */
+    node_to_cpu_obj = PyList_New(0);
+    /* build list of node to cpu mappings */
+    for ( i=0; i<info.nr_nodes; i++ )
+    {
+        cpumap_t cpumap = (cpumap_t)map[i];
+        PyObject *cpus = PyList_New(0);
+ 
+        for ( j=0; cpumap != 0; j++ ) 
+        {
+            if ( cpumap & 1 )
+                PyList_Append(cpus, PyInt_FromLong(j));
+            cpumap >>=1;
+        }
+        PyList_Append(node_to_cpu_obj, cpus); 
+    }
+    /* add list of node to cpu mappings and nr_nodes to physinfo dictionary */
+    PyDict_SetItemString(ret_obj, "node_to_cpu",  node_to_cpu_obj);
+    PyDict_SetItemString(ret_obj, "nr_nodes", 
+             Py_BuildValue("i", info.nr_nodes));
+
+    /* free malloc'd memory */
+    free(chunks);
+    free(map);
+ 
+    return ret_obj;
 }
 
 static PyObject *pyxc_xeninfo(XcObject *self)
diff -r b92d38d9be28 -r b13712d6a615 tools/python/xen/xend/XendNode.py
--- a/tools/python/xen/xend/XendNode.py Thu May 11 20:49:50 2006
+++ b/tools/python/xen/xend/XendNode.py Thu May 11 21:01:06 2006
@@ -56,6 +56,69 @@
                 ['version', ver],
                 ['machine', mch]]
 
+    def list_to_rangepairs(self,cmap):
+            cmap.sort()
+            pairs = []
+            x = y = 0
+            for i in range(0,len(cmap)):
+                try:
+                    if ((cmap[y+1] - cmap[i]) > 1):
+                        pairs.append((cmap[x],cmap[y]))
+                        x = y = i+1
+                    else:
+                        y = y + 1
+                # if we go off the end, then just add x to y
+                except IndexError:
+                    pairs.append((cmap[x],cmap[y]))
+
+            return pairs
+
+    def format_pairs(self,pairs):
+            if not pairs:
+                return "no cpus"
+            out = ""
+            for f,s in pairs:
+                if (f==s):
+                    out += '%d'%f
+                else:
+                    out += '%d-%d'%(f,s)
+                out += ','
+            # trim trailing ','
+            return out[:-1]
+
+    def list_to_strrange(self,list):
+        return self.format_pairs(self.list_to_rangepairs(list))
+
+    def format_memchunks(self, pinfo):
+        str=''
+        whitespace=''
+        try:
+            chunk=pinfo['mem_chunks']
+            for i in range(0, pinfo['nr_chunks']):
+                str+='%snode%d:0x%016x-0x%016x\n' % (whitespace,
+                                                    chunk[i]['node'],
+                                                    chunk[i]['start_paddr'], 
+                                                    chunk[i]['end_paddr']) 
+                whitespace='%25s' % ''
+        except:
+            str='none\n' 
+        return str[:-1]
+        
+    def format_node_to_cpu(self, pinfo):
+        str=''
+        whitespace=''
+        try:
+            node_to_cpu=pinfo['node_to_cpu']
+            for i in range(0, pinfo['nr_nodes']):
+                str+='%snode%d:%s\n' % (whitespace,
+                                        i, 
+                                      self.list_to_strrange(node_to_cpu[i]))
+                whitespace='%25s' % ''        
+        except:
+            str='none\n'
+        return str[:-1];
+
+
     def physinfo(self):
         info = self.xc.physinfo()
 
@@ -64,6 +127,8 @@
                            info['cores_per_socket'] *
                            info['threads_per_core'])
         info['cpu_mhz'] = info['cpu_khz'] / 1000
+        info['mem_chunks'] = self.format_memchunks(info)
+        info['node_to_cpu'] = self.format_node_to_cpu(info)
 
         ITEM_ORDER = ['nr_cpus',
                       'nr_nodes',
@@ -74,6 +139,8 @@
                       'hw_caps',
                       'total_memory',
                       'free_memory',
+                      'mem_chunks',
+                      'node_to_cpu'
                       ]
 
         return [[k, info[k]] for k in ITEM_ORDER]
diff -r b92d38d9be28 -r b13712d6a615 tools/xm-test/tests/info/02_info_compiledata_pos.py
--- a/tools/xm-test/tests/info/02_info_compiledata_pos.py       Thu May 11 20:49:50 2006
+++ b/tools/xm-test/tests/info/02_info_compiledata_pos.py       Thu May 11 21:01:06 2006
@@ -18,9 +18,7 @@
 for line in lines:
     pieces = line.split(" : ", 1)
 
-    if len(pieces) < 2:
-        FAIL("Found invalid line: [%s]" % line)
-    else:
+    if len(pieces) > 1:
         map[pieces[0]] = pieces[1]
 
 for field in ["cores_per_socket", "threads_per_core", "cpu_mhz",
diff -r b92d38d9be28 -r b13712d6a615 xen/arch/x86/dom0_ops.c
--- a/xen/arch/x86/dom0_ops.c   Thu May 11 20:49:50 2006
+++ b/xen/arch/x86/dom0_ops.c   Thu May 11 21:01:06 2006
@@ -25,6 +25,7 @@
 #include <asm/hvm/support.h>
 #include <asm/processor.h>
 #include <public/sched_ctl.h>
+#include <xen/numa.h>
 
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
@@ -183,6 +184,8 @@
     case DOM0_PHYSINFO:
     {
         dom0_physinfo_t *pi = &op->u.physinfo;
+        int i;
+        u64 node_to_cpu_64[MAX_NUMNODES];
 
         pi->threads_per_core =
             cpus_weight(cpu_sibling_map[0]);
@@ -191,12 +194,50 @@
         pi->sockets_per_node = 
             num_online_cpus() / cpus_weight(cpu_core_map[0]);
 
-        pi->nr_nodes         = 1;
         pi->total_pages      = total_pages;
         pi->free_pages       = avail_domheap_pages();
         pi->cpu_khz          = cpu_khz;
         memset(pi->hw_cap, 0, sizeof(pi->hw_cap));
         memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4);
+
+        /* memory chunks */
+        pi->nr_chunks = num_memory_chunks;
+
+        DPRINTK("num_memory_chunks:%d\n", num_memory_chunks);
+        for ( i=0; i<num_memory_chunks; i++ ) {
+            DPRINTK("node%d:%"PRIx64"\n", node_memory_chunk[i].nid,
+                                          node_memory_chunk[i].start_paddr);
+            DPRINTK("node%d:%"PRIx64"\n", node_memory_chunk[i].nid,
+                                          node_memory_chunk[i].end_paddr);
+
+            /* copy memory chunk structs to guest */
+            ret = 0;
+            if ( copy_to_guest_offset(op->u.physinfo.memory_chunks, i, 
+                                      &(node_memory_chunk[i]), 1) ) {
+                ret = -EFAULT;
+                break;
+            }
+        }
+
+        /* node to cpu mask */
+        pi->nr_nodes = num_online_nodes();
+
+        /* copy cpu to node mapping to domU */
+        /* converting cpumask to u64 b/c userspace doesn't 
+         * know about cpumask_t and is accepting a u64 */
+        memset(node_to_cpu_64, 0, sizeof(node_to_cpu_64));
+        for ( i=0; i<pi->nr_nodes; i++) {
+            int j = 0;
+            for ( j=0; j<num_online_cpus(); j++)
+                if ( cpu_isset(j, node_to_cpumask[i]) )
+                    node_to_cpu_64[i] |= (u64)1 << j;
+
+            if ( copy_to_guest_offset(op->u.physinfo.node_to_cpu, 
+                                      i, &(node_to_cpu_64[i]), 1) ) {
+                ret = -EFAULT;
+                break;
+            }
+        }
         ret = 0;
         if ( copy_to_guest(u_dom0_op, op, 1) )
             ret = -EFAULT;
diff -r b92d38d9be28 -r b13712d6a615 xen/include/public/arch-x86_32.h
--- a/xen/include/public/arch-x86_32.h  Thu May 11 20:49:50 2006
+++ b/xen/include/public/arch-x86_32.h  Thu May 11 21:01:06 2006
@@ -24,6 +24,7 @@
 __DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
 __DEFINE_XEN_GUEST_HANDLE(uint,  unsigned int);
 __DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
+__DEFINE_XEN_GUEST_HANDLE(u64, uint64_t);
 DEFINE_XEN_GUEST_HANDLE(char);
 DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(long);
diff -r b92d38d9be28 -r b13712d6a615 xen/include/public/arch-x86_64.h
--- a/xen/include/public/arch-x86_64.h  Thu May 11 20:49:50 2006
+++ b/xen/include/public/arch-x86_64.h  Thu May 11 21:01:06 2006
@@ -24,6 +24,7 @@
 __DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
 __DEFINE_XEN_GUEST_HANDLE(uint,  unsigned int);
 __DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
+__DEFINE_XEN_GUEST_HANDLE(u64, uint64_t);
 DEFINE_XEN_GUEST_HANDLE(char);
 DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(long);
diff -r b92d38d9be28 -r b13712d6a615 xen/include/public/dom0_ops.h
--- a/xen/include/public/dom0_ops.h     Thu May 11 20:49:50 2006
+++ b/xen/include/public/dom0_ops.h     Thu May 11 21:01:06 2006
@@ -13,6 +13,7 @@
 
 #include "xen.h"
 #include "sched_ctl.h"
+#include "numa_structs.h"
 
 /*
  * Make sure you increment the interface version whenever you modify this file!
@@ -219,6 +220,9 @@
     unsigned long total_pages;
     unsigned long free_pages;
     uint32_t hw_cap[8];
+    uint32_t nr_chunks;
+    XEN_GUEST_HANDLE(node_memory_chunk_t) memory_chunks;
+    XEN_GUEST_HANDLE(u64) node_to_cpu;
 } dom0_physinfo_t;
 DEFINE_XEN_GUEST_HANDLE(dom0_physinfo_t);
 
diff -r b92d38d9be28 -r b13712d6a615 xen/include/xen/numa.h
--- a/xen/include/xen/numa.h    Thu May 11 20:49:50 2006
+++ b/xen/include/xen/numa.h    Thu May 11 21:01:06 2006
@@ -2,6 +2,7 @@
 #define _XEN_NUMA_H
 
 #include <xen/config.h>
+#include <public/numa_structs.h>
 
 #ifdef CONFIG_DISCONTIGMEM
 #include <asm/numnodes.h>
@@ -19,13 +20,7 @@
 #define MAX_CHUNKS_PER_NODE   4
 #define MAXCHUNKS    (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
 
-typedef struct node_memory_chunk {
-   uint64_t start_paddr; /* physical address of chunk start */
-   uint64_t end_paddr;   /* physical address of chunk end */
-   uint8_t pxm;          /* proximity domain of node */
-   uint8_t nid;          /* which cnode contains this chunk? */
-} node_memory_chunk_t;
-DEFINE_XEN_GUEST_HANDLE(node_memory_chunk_t);
+#include <xen/nodemask.h>
 
 extern node_memory_chunk_t node_memory_chunk[];
 extern int num_memory_chunks;
diff -r b92d38d9be28 -r b13712d6a615 xen/include/public/numa_structs.h
--- /dev/null   Thu May 11 20:49:50 2006
+++ b/xen/include/public/numa_structs.h Thu May 11 21:01:06 2006
@@ -0,0 +1,26 @@
+/*
+ * Ryan Grimm <grimm@xxxxxxxxxx>
+ * Copyright (c) 2006, International Business Machines Corporation.
+ *
+ */
+
+#ifndef __XEN_PUBLIC_NUMA_STRUCTS_H__
+
+#define __XEN_PUBLIC_NUMA_STRUCTS_H__
+
+#include "xen.h"
+
+/* define these for xc to use b/c MAX_NUMNODES and MAX_CHUNKS
+ * are not exposed in /public */
+#define PUBLIC_MAX_NUMNODES 16
+#define PUBLIC_MAXCHUNKS 32
+
+typedef struct node_memory_chunk {
+   uint64_t start_paddr; /* physical address of chunk start */
+   uint64_t end_paddr;   /* physical address of chunk end */
+   uint8_t pxm;          /* proximity domain of node */
+   uint8_t nid;          /* which cnode contains this chunk? */
+} node_memory_chunk_t;
+DEFINE_XEN_GUEST_HANDLE(node_memory_chunk_t);
+
+#endif

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
