* Keir Fraser <keir@xxxxxxxxxxxxx> [2007-04-10 04:13]:
> On 10/4/07 02:09, "Ryan Harper" <ryanh@xxxxxxxxxx> wrote:
>
> > nr_nodes : 4
> > mem_chunks : node0:0x0000000000000000-0x0000000190000000
> > node1:0x0000000190000000-0x0000000300000000
> > node2:0x0000000300000000-0x0000000470000000
> > node3:0x0000000470000000-0x0000000640000000
> > node_to_cpu : node0:0-7
> > node1:8-15
> > node2:16-23
> > node3:24-31
> >
> > I've also reworked the physinfo call to contain an array of
> > cpu_to_node elements rather than node_to_cpu, to support machines larger
> > than 64-way. I convert the array back to node_to_cpu for brevity in
> > the xm info display.
>
> The same would make sense for memory regions (i.e., have a list of
> memory-regions and include a node identifier for each one, rather than
> mapping node-id to memory-region), as this would make it easy to have multiple
> memory regions per node. But actually I'm not convinced that
> allowing dom0 to read out the physical addresses of memory regions is at all
> useful -- why would anyone care which particular physical address ranges
> belong to a particular node? The hypercall to find the amount of free memory per
> node seems more useful, and probably sufficient by itself.
Updated.
- Dropped mem_chunks (removed from the existing ia64 NUMA physinfo)
- Fixed the ia64 cpu_to_node_map array size (was MAX_NUMNODES, now
  NR_CPUS)
- Fixed the sockets_per_node calculation (was bogus on Opteron systems;
  see the worked example below)
- Updated all arches' physinfo calls to use num_online_nodes() and the
  new sockets_per_node calculation
Untested on ia64, ppc.
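
For illustration only (not part of the patch), the new calculation on the
32-way example box shown in the xm info output below works out as:

    sockets_per_node = num_online_cpus() /
                       (nr_nodes * cores_per_socket * threads_per_core)
                     = 32 / (4 * 1 * 2)
                     = 4

whereas the old num_online_cpus() / cpus_weight(cpu_core_map[0]) form gives
16 here, i.e. sockets per machine rather than per node, which is only right
when nr_nodes is 1.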
--
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253 T/L: 678-9253
ryanh@xxxxxxxxxx
diffstat output:
tools/libxc/xc_misc.c | 3
tools/libxc/xenctrl.h | 1
tools/python/xen/lowlevel/xc/xc.c | 61 ++++++++++++++++----
tools/python/xen/xend/XendNode.py | 50 ++++++++++++++++
tools/xenmon/xenbaked.c | 3
tools/xenstat/libxenstat/src/xenstat.c | 3
tools/xentrace/xentrace.c | 3
tools/xm-test/tests/info/02_info_compiledata_pos.py | 4 -
xen/arch/ia64/xen/dom0_ops.c | 46 +--------------
xen/arch/powerpc/sysctl.c | 6 -
xen/arch/x86/sysctl.c | 33 +++++++++-
xen/include/public/sysctl.h | 1
12 files changed, 152 insertions(+), 62 deletions(-)
Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
---
This patch modifies the physinfo hcall to export NUMA cpu_to_node
topology information. The new physinfo hcall is integrated into libxc
and xend (xm info specifically). Included in this patch is a minor
tweak to xm-test's xm info testcase. I've also fixed the
sockets_per_node calculation. The new fields in xm info are:
nr_cpus : 32
nr_nodes : 4
sockets_per_node : 4
cores_per_socket : 1
threads_per_core : 2
...
node_to_cpu : node0:0-7
node1:8-15
node2:16-23
node3:24-31
I've also reworked the physinfo call to contain an array of
cpu_to_node elements rather than node_to_cpu, to support machines larger
than 64-way. I convert the array back to node_to_cpu for brevity in
the xm info display.
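
For reference, here is a minimal sketch (not part of the patch; the bound and
error handling are purely illustrative) of how a tools-side caller might
consume the new cpu_to_node array through the modified xc_physinfo(). It
mirrors the usage in the xc.c hunk below:

#include <stdio.h>
#include <string.h>
#include <xenctrl.h>

#define MAX_NR_CPUS 256   /* same illustrative bound as the xc.c hunk */

int main(void)
{
    int xc_handle = xc_interface_open();
    xc_physinfo_t info;
    xc_cpu_to_node_t map[MAX_NR_CPUS];
    int i, nr_cpus;

    /* zero the struct so cpu_to_node stays NULL unless we point it at a buffer */
    memset(&info, 0, sizeof(info));
    set_xen_guest_handle(info.cpu_to_node, map);

    if ( xc_physinfo(xc_handle, &info) != 0 )
        return 1;

    nr_cpus = info.threads_per_core * info.cores_per_socket *
              info.sockets_per_node * info.nr_nodes;

    /* each array entry holds the node that the corresponding cpu belongs to */
    for ( i = 0; i < nr_cpus && i < MAX_NR_CPUS; i++ )
        printf("cpu%d -> node%u\n", i, (unsigned)map[i]);

    xc_interface_close(xc_handle);
    return 0;
}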
Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
diff -r 400a3dca237e tools/libxc/xc_misc.c
--- a/tools/libxc/xc_misc.c Mon Apr 09 12:05:26 2007 +0100
+++ b/tools/libxc/xc_misc.c Fri Apr 13 13:04:24 2007 -0500
@@ -59,6 +59,9 @@ int xc_physinfo(int xc_handle,
DECLARE_SYSCTL;
sysctl.cmd = XEN_SYSCTL_physinfo;
+
+ /* set pointers to caller's so memcpy doesn't clobber them */
+ sysctl.u.physinfo.cpu_to_node = put_info->cpu_to_node;
if ( (ret = do_sysctl(xc_handle, &sysctl)) != 0 )
return ret;
diff -r 400a3dca237e tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h Mon Apr 09 12:05:26 2007 +0100
+++ b/tools/libxc/xenctrl.h Fri Apr 13 13:04:24 2007 -0500
@@ -473,6 +473,7 @@ int xc_send_debug_keys(int xc_handle, ch
int xc_send_debug_keys(int xc_handle, char *keys);
typedef xen_sysctl_physinfo_t xc_physinfo_t;
+typedef uint32_t xc_cpu_to_node_t;
int xc_physinfo(int xc_handle,
xc_physinfo_t *info);
diff -r 400a3dca237e tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Mon Apr 09 12:05:26 2007 +0100
+++ b/tools/python/xen/lowlevel/xc/xc.c Fri Apr 13 15:41:39 2007 -0500
@@ -640,14 +640,26 @@ static PyObject *pyxc_pages_to_kib(XcObj
}
+#define MAX_NR_CPUS 256
static PyObject *pyxc_physinfo(XcObject *self)
{
xc_physinfo_t info;
char cpu_cap[128], *p=cpu_cap, *q=cpu_cap;
- int i;
+ int i,j, nr_cpus;
+ PyObject *ret_obj, *node_to_cpu_obj;
+ xc_cpu_to_node_t *map;
+
+ /* make space for cpu_to_node mapping, up to MAX_NR_CPUS cpus */
+ map = (xc_cpu_to_node_t *)malloc( sizeof(xc_cpu_to_node_t) * MAX_NR_CPUS);
+
+ set_xen_guest_handle(info.cpu_to_node, map);
if ( xc_physinfo(self->xc_handle, &info) != 0 )
return pyxc_error_to_exception();
+
+ /* calc number of cpus */
+ nr_cpus = info.threads_per_core * info.cores_per_socket *
+ info.sockets_per_node * info.nr_nodes;
*q=0;
for(i=0;i<sizeof(info.hw_cap)/4;i++)
@@ -659,16 +671,43 @@ static PyObject *pyxc_physinfo(XcObject
if(q>cpu_cap)
*(q-1)=0;
- return Py_BuildValue("{s:i,s:i,s:i,s:i,s:l,s:l,s:l,s:i,s:s}",
- "threads_per_core", info.threads_per_core,
- "cores_per_socket", info.cores_per_socket,
- "sockets_per_node", info.sockets_per_node,
- "nr_nodes", info.nr_nodes,
- "total_memory", pages_to_kib(info.total_pages),
- "free_memory", pages_to_kib(info.free_pages),
- "scrub_memory", pages_to_kib(info.scrub_pages),
- "cpu_khz", info.cpu_khz,
- "hw_caps", cpu_cap);
+ ret_obj = Py_BuildValue("{s:i,s:i,s:i,s:l,s:l,s:l,s:i,s:s}",
+ "threads_per_core", info.threads_per_core,
+ "cores_per_socket", info.cores_per_socket,
+ "sockets_per_node", info.sockets_per_node,
+ "total_memory", pages_to_kib(info.total_pages),
+ "free_memory", pages_to_kib(info.free_pages),
+ "scrub_memory", pages_to_kib(info.scrub_pages),
+ "cpu_khz", info.cpu_khz,
+ "hw_caps", cpu_cap);
+
+ /* node to cpu mappings */
+ node_to_cpu_obj = PyList_New(0);
+
+ /* make a list for each node */
+ for ( i=0; i<info.nr_nodes; i++)
+ {
+ PyObject *cpus = PyList_New(0);
+
+ /* walk the cpu_to_node array, for each cpu
+ which maps to node i, add to cpus list */
+ for ( j=0; j<nr_cpus; j++)
+ {
+ /* this cpu j maps to node i */
+ if ( i == (uint32_t)map[j])
+ PyList_Append(cpus, PyInt_FromLong(j));
+ }
+ PyList_Append(node_to_cpu_obj, cpus);
+ }
+ /* add list of node to cpu mappings and nr_nodes to physinfo dictionary */
+ PyDict_SetItemString(ret_obj, "node_to_cpu", node_to_cpu_obj);
+ PyDict_SetItemString(ret_obj, "nr_nodes",
+ Py_BuildValue("i", info.nr_nodes));
+
+ /* free malloc'd memory */
+ free(map);
+
+ return ret_obj;
}
static PyObject *pyxc_xeninfo(XcObject *self)
diff -r 400a3dca237e tools/python/xen/xend/XendNode.py
--- a/tools/python/xen/xend/XendNode.py Mon Apr 09 12:05:26 2007 +0100
+++ b/tools/python/xen/xend/XendNode.py Fri Apr 13 13:04:24 2007 -0500
@@ -547,6 +547,54 @@ class XendNode:
['version', ver],
['machine', mch]]
+ def list_to_rangepairs(self,cmap):
+ cmap.sort()
+ pairs = []
+ x = y = 0
+ for i in range(0,len(cmap)):
+ try:
+ if ((cmap[y+1] - cmap[i]) > 1):
+ pairs.append((cmap[x],cmap[y]))
+ x = y = i+1
+ else:
+ y = y + 1
+ # if we go off the end, then just add x to y
+ except IndexError:
+ pairs.append((cmap[x],cmap[y]))
+
+ return pairs
+
+ def format_pairs(self,pairs):
+ if not pairs:
+ return "no cpus"
+ out = ""
+ for f,s in pairs:
+ if (f==s):
+ out += '%d'%f
+ else:
+ out += '%d-%d'%(f,s)
+ out += ','
+ # trim trailing ','
+ return out[:-1]
+
+ def list_to_strrange(self,list):
+ return self.format_pairs(self.list_to_rangepairs(list))
+
+ def format_node_to_cpu(self, pinfo):
+ str=''
+ whitespace=''
+ try:
+ node_to_cpu=pinfo['node_to_cpu']
+ for i in range(0, pinfo['nr_nodes']):
+ str+='%snode%d:%s\n' % (whitespace,
+ i,
+ self.list_to_strrange(node_to_cpu[i]))
+ whitespace='%25s' % ''
+ except:
+ str='none\n'
+ return str[:-1];
+
+
def physinfo(self):
info = self.xc.physinfo()
@@ -559,6 +607,7 @@ class XendNode:
# physinfo is in KiB, need it in MiB
info['total_memory'] = info['total_memory'] / 1024
info['free_memory'] = info['free_memory'] / 1024
+ info['node_to_cpu'] = self.format_node_to_cpu(info)
ITEM_ORDER = ['nr_cpus',
'nr_nodes',
@@ -569,6 +618,7 @@ class XendNode:
'hw_caps',
'total_memory',
'free_memory',
+ 'node_to_cpu'
]
return [[k, info[k]] for k in ITEM_ORDER]
diff -r 400a3dca237e tools/xenmon/xenbaked.c
--- a/tools/xenmon/xenbaked.c Mon Apr 09 12:05:26 2007 +0100
+++ b/tools/xenmon/xenbaked.c Fri Apr 13 13:04:24 2007 -0500
@@ -448,6 +448,9 @@ unsigned int get_num_cpus(void)
int xc_handle = xc_interface_open();
int ret;
+ /* ensure node_to_cpu is NULL */
+ memset(&physinfo, 0, sizeof(physinfo));
+
ret = xc_physinfo(xc_handle, &physinfo);
if ( ret != 0 )
diff -r 400a3dca237e tools/xenstat/libxenstat/src/xenstat.c
--- a/tools/xenstat/libxenstat/src/xenstat.c Mon Apr 09 12:05:26 2007 +0100
+++ b/tools/xenstat/libxenstat/src/xenstat.c Fri Apr 13 13:04:24 2007 -0500
@@ -147,6 +147,9 @@ xenstat_node *xenstat_get_node(xenstat_h
/* Store the handle in the node for later access */
node->handle = handle;
+
+ /* ensure node_to_cpu is NULL */
+ memset(&physinfo, 0, sizeof(physinfo));
/* Get information about the physical system */
if (xc_physinfo(handle->xc_handle, &physinfo) < 0) {
diff -r 400a3dca237e tools/xentrace/xentrace.c
--- a/tools/xentrace/xentrace.c Mon Apr 09 12:05:26 2007 +0100
+++ b/tools/xentrace/xentrace.c Fri Apr 13 13:04:24 2007 -0500
@@ -260,6 +260,9 @@ unsigned int get_num_cpus(void)
int xc_handle = xc_interface_open();
int ret;
+ /* ensure node_to_cpu is NULL */
+ memset(&physinfo, 0, sizeof(physinfo));
+
ret = xc_physinfo(xc_handle, &physinfo);
if ( ret != 0 )
diff -r 400a3dca237e tools/xm-test/tests/info/02_info_compiledata_pos.py
--- a/tools/xm-test/tests/info/02_info_compiledata_pos.py Mon Apr 09 12:05:26 2007 +0100
+++ b/tools/xm-test/tests/info/02_info_compiledata_pos.py Fri Apr 13 13:04:24 2007 -0500
@@ -18,9 +18,7 @@ for line in lines:
for line in lines:
pieces = line.split(" : ", 1)
- if len(pieces) < 2:
- FAIL("Found invalid line: [%s]" % line)
- else:
+ if len(pieces) > 1:
map[pieces[0]] = pieces[1]
for field in ["cores_per_socket", "threads_per_core", "cpu_mhz",
diff -r 400a3dca237e xen/arch/ia64/xen/dom0_ops.c
--- a/xen/arch/ia64/xen/dom0_ops.c Mon Apr 09 12:05:26 2007 +0100
+++ b/xen/arch/ia64/xen/dom0_ops.c Fri Apr 13 13:20:38 2007 -0500
@@ -239,8 +239,7 @@ long arch_do_sysctl(xen_sysctl_t *op, XE
{
#ifdef IA64_NUMA_PHYSINFO
int i;
- node_data_t *chunks;
- u64 *map, cpu_to_node_map[MAX_NUMNODES];
+ uint32_t *map, cpu_to_node_map[NR_CPUS];
#endif
xen_sysctl_physinfo_t *pi = &op->u.physinfo;
@@ -249,11 +248,9 @@ long arch_do_sysctl(xen_sysctl_t *op, XE
cpus_weight(cpu_sibling_map[0]);
pi->cores_per_socket =
cpus_weight(cpu_core_map[0]) / pi->threads_per_core;
- pi->sockets_per_node =
- num_online_cpus() / cpus_weight(cpu_core_map[0]);
-#ifndef IA64_NUMA_PHYSINFO
- pi->nr_nodes = 1;
-#endif
+ pi->nr_nodes = num_online_nodes();
+ pi->sockets_per_node = num_online_cpus() /
+ (pi->nr_nodes * pi->cores_per_socket * pi->threads_per_core);
pi->total_pages = total_pages;
pi->free_pages = avail_domheap_pages();
pi->scrub_pages = avail_scrub_pages();
@@ -263,41 +260,6 @@ long arch_do_sysctl(xen_sysctl_t *op, XE
ret = 0;
#ifdef IA64_NUMA_PHYSINFO
- /* fetch memory_chunk pointer from guest */
- get_xen_guest_handle(chunks, pi->memory_chunks);
-
- printk("chunks=%p, num_node_memblks=%u\n", chunks, num_node_memblks);
- /* if it is set, fill out memory chunk array */
- if (chunks != NULL) {
- if (num_node_memblks == 0) {
- /* Non-NUMA machine. Put pseudo-values. */
- node_data_t data;
- data.node_start_pfn = 0;
- data.node_spanned_pages = total_pages;
- data.node_id = 0;
- /* copy memory chunk structs to guest */
- if (copy_to_guest_offset(pi->memory_chunks, 0, &data, 1)) {
- ret = -EFAULT;
- break;
- }
- } else {
- for (i = 0; i < num_node_memblks && i < PUBLIC_MAXCHUNKS; i++) {
- node_data_t data;
- data.node_start_pfn = node_memblk[i].start_paddr >>
- PAGE_SHIFT;
- data.node_spanned_pages = node_memblk[i].size >>
- PAGE_SHIFT;
- data.node_id = node_memblk[i].nid;
- /* copy memory chunk structs to guest */
- if (copy_to_guest_offset(pi->memory_chunks, i, &data, 1)) {
- ret = -EFAULT;
- break;
- }
- }
- }
- }
- /* set number of notes */
- pi->nr_nodes = num_online_nodes();
-
/* fetch cpu_to_node pointer from guest */
get_xen_guest_handle(map, pi->cpu_to_node);
diff -r 400a3dca237e xen/arch/powerpc/sysctl.c
--- a/xen/arch/powerpc/sysctl.c Mon Apr 09 12:05:26 2007 +0100
+++ b/xen/arch/powerpc/sysctl.c Fri Apr 13 13:09:31 2007 -0500
@@ -45,10 +45,10 @@ long arch_do_sysctl(struct xen_sysctl *s
cpus_weight(cpu_sibling_map[0]);
pi->cores_per_socket =
cpus_weight(cpu_core_map[0]) / pi->threads_per_core;
- pi->sockets_per_node =
- num_online_cpus() / cpus_weight(cpu_core_map[0]);
+ pi->sockets_per_node = num_online_cpus() /
+ (num_online_nodes() * pi->cores_per_socket * pi->threads_per_core);
- pi->nr_nodes = 1;
+ pi->nr_nodes = num_online_nodes();
pi->total_pages = total_pages;
pi->free_pages = avail_domheap_pages();
pi->cpu_khz = cpu_khz;
diff -r 400a3dca237e xen/arch/x86/sysctl.c
--- a/xen/arch/x86/sysctl.c Mon Apr 09 12:05:26 2007 +0100
+++ b/xen/arch/x86/sysctl.c Fri Apr 13 13:11:15 2007 -0500
@@ -23,6 +23,10 @@
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
#include <asm/processor.h>
+#include <asm/numa.h>
+#include <xen/nodemask.h>
+
+#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0)
long arch_do_sysctl(
struct xen_sysctl *sysctl, XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl)
@@ -34,16 +38,19 @@ long arch_do_sysctl(
case XEN_SYSCTL_physinfo:
{
+ int i;
+ uint32_t *map, cpu_to_node_map[NR_CPUS];
+
xen_sysctl_physinfo_t *pi = &sysctl->u.physinfo;
pi->threads_per_core =
cpus_weight(cpu_sibling_map[0]);
pi->cores_per_socket =
cpus_weight(cpu_core_map[0]) / pi->threads_per_core;
- pi->sockets_per_node =
- num_online_cpus() / cpus_weight(cpu_core_map[0]);
+ pi->nr_nodes = num_online_nodes();
+ pi->sockets_per_node = num_online_cpus() /
+ (pi->nr_nodes * pi->cores_per_socket * pi->threads_per_core);
- pi->nr_nodes = 1;
pi->total_pages = total_pages;
pi->free_pages = avail_domheap_pages();
pi->scrub_pages = avail_scrub_pages();
@@ -51,6 +58,26 @@ long arch_do_sysctl(
memset(pi->hw_cap, 0, sizeof(pi->hw_cap));
memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4);
ret = 0;
+
+ /* fetch cpu_to_node pointer from guest */
+ get_xen_guest_handle(map, sysctl->u.physinfo.cpu_to_node);
+
+ /* if set, fill out cpu_to_node array */
+ if ( map != NULL )
+ {
+ /* for each cpu, mark in which node the cpu belongs */
+ memset(cpu_to_node_map, 0, sizeof(cpu_to_node_map));
+ for ( i = 0; i < num_online_cpus(); i++)
+ {
+ cpu_to_node_map[i]=cpu_to_node(i);
+ if ( copy_to_guest_offset(sysctl->u.physinfo.cpu_to_node,
+ i, &(cpu_to_node_map[i]), 1) ) {
+ ret = -EFAULT;
+ break;
+ }
+ }
+ }
+
if ( copy_to_guest(u_sysctl, sysctl, 1) )
ret = -EFAULT;
}
diff -r 400a3dca237e xen/include/public/sysctl.h
--- a/xen/include/public/sysctl.h Mon Apr 09 12:05:26 2007 +0100
+++ b/xen/include/public/sysctl.h Fri Apr 13 13:04:24 2007 -0500
@@ -85,6 +85,7 @@ struct xen_sysctl_physinfo {
uint64_aligned_t free_pages;
uint64_aligned_t scrub_pages;
uint32_t hw_cap[8];
+ XEN_GUEST_HANDLE(uint32_t) cpu_to_node;
};
typedef struct xen_sysctl_physinfo xen_sysctl_physinfo_t;
DEFINE_XEN_GUEST_HANDLE(xen_sysctl_physinfo_t);
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel