WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] [xen-unstable] x86: fix NUMA handling (c/s 20599:e5a757c

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] x86: fix NUMA handling (c/s 20599:e5a757ce7845)
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Fri, 08 Jan 2010 04:00:15 -0800
Delivery-date: Fri, 08 Jan 2010 04:00:14 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1262949761 0
# Node ID 5e8b6ecd045e827f3229f3a2fb15621946c50a6b
# Parent  cba56c13ca3eba67a3b56e78256418fd62445a95
x86: fix NUMA handling (c/s 20599:e5a757ce7845)

c/s 20599 caused the hash shift to become significantly smaller on
systems with an SRAT like this

(XEN) SRAT: Node 0 PXM 0 0-a0000
(XEN) SRAT: Node 0 PXM 0 100000-80000000
(XEN) SRAT: Node 1 PXM 1 80000000-d0000000
(XEN) SRAT: Node 1 PXM 1 100000000-130000000

Combined with the static size of the memnodemap[] array, NUMA got
therefore disabled on such systems. The backport from Linux was really
incomplete, as Linux much earlier had already introduced a dynamically
allocated memnodemap[].

Further, doing to/from pdx translations on addresses just past a valid
range is not correct, as it may strip/fail to insert non-zero bits in
this case.

Finally, using 63 as the cover-it-all shift value is invalid on 32bit,
since pdx values are unsigned long.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>
---
 xen/arch/x86/numa.c        |   45 +++++++++++++++++++++++++++++++++++++--------
 xen/include/asm-x86/numa.h |    6 +++---
 2 files changed, 40 insertions(+), 11 deletions(-)

diff -r cba56c13ca3e -r 5e8b6ecd045e xen/arch/x86/numa.c
--- a/xen/arch/x86/numa.c       Wed Jan 06 12:45:23 2010 +0000
+++ b/xen/arch/x86/numa.c       Fri Jan 08 11:22:41 2010 +0000
@@ -30,7 +30,9 @@ struct node_data node_data[MAX_NUMNODES]
 
 /* Mapping from pdx to node id */
 int memnode_shift;
-u8  memnodemap[NODEMAPSIZE];
+static typeof(*memnodemap) _memnodemap[2];
+unsigned long memnodemapsize;
+u8 *memnodemap;
 
 unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
        [0 ... NR_CPUS-1] = NUMA_NO_NODE
@@ -62,13 +64,13 @@ static int __init populate_memnodemap(co
        unsigned long spdx, epdx;
        int i, res = -1;
 
-       memset(memnodemap, NUMA_NO_NODE, sizeof(memnodemap));
+       memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
        for (i = 0; i < numnodes; i++) {
                spdx = paddr_to_pdx(nodes[i].start);
-               epdx = paddr_to_pdx(nodes[i].end);
+               epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
                if (spdx >= epdx)
                        continue;
-               if ((epdx >> shift) >= NODEMAPSIZE)
+               if ((epdx >> shift) >= memnodemapsize)
                        return 0;
                do {
                        if (memnodemap[spdx >> shift] != NUMA_NO_NODE)
@@ -84,6 +86,28 @@ static int __init populate_memnodemap(co
                res = 1;
        }
        return res;
+}
+
+static int __init allocate_cachealigned_memnodemap(void)
+{
+       unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
+       unsigned long mfn = alloc_boot_pages(size, 1);
+
+       if (!mfn) {
+               printk(KERN_ERR
+                      "NUMA: Unable to allocate Memory to Node hash map\n");
+               memnodemapsize = 0;
+               return -1;
+       }
+
+       memnodemap = mfn_to_virt(mfn);
+       mfn <<= PAGE_SHIFT;
+       size <<= PAGE_SHIFT;
+       printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+              mfn, mfn + size);
+       memnodemapsize = size / sizeof(*memnodemap);
+
+       return 0;
 }
 
 /*
@@ -99,7 +123,7 @@ static int __init extract_lsb_from_nodes
 
        for (i = 0; i < numnodes; i++) {
                spdx = paddr_to_pdx(nodes[i].start);
-               epdx = paddr_to_pdx(nodes[i].end);
+               epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
                if (spdx >= epdx)
                        continue;
                bitfield |= spdx;
@@ -108,9 +132,10 @@ static int __init extract_lsb_from_nodes
                        memtop = epdx;
        }
        if (nodes_used <= 1)
-               i = 63;
+               i = BITS_PER_LONG - 1;
        else
                i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+       memnodemapsize = (memtop >> i) + 1;
        return i;
 }
 
@@ -120,6 +145,10 @@ int __init compute_hash_shift(struct nod
        int shift;
 
        shift = extract_lsb_from_nodes(nodes, numnodes);
+       if (memnodemapsize <= ARRAY_SIZE(_memnodemap))
+               memnodemap = _memnodemap;
+       else if (allocate_cachealigned_memnodemap())
+               return -1;
        printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
                shift);
 
@@ -233,8 +262,8 @@ void __init numa_initmem_init(unsigned l
               (u64)start_pfn << PAGE_SHIFT,
               (u64)end_pfn << PAGE_SHIFT);
        /* setup dummy node covering all memory */ 
-       memnode_shift = 63; 
-       memnodemap[0] = 0;
+       memnode_shift = BITS_PER_LONG - 1;
+       memnodemap = _memnodemap;
        nodes_clear(node_online_map);
        node_set_online(0);
        for (i = 0; i < NR_CPUS; i++)
diff -r cba56c13ca3e -r 5e8b6ecd045e xen/include/asm-x86/numa.h
--- a/xen/include/asm-x86/numa.h        Wed Jan 06 12:45:23 2010 +0000
+++ b/xen/include/asm-x86/numa.h        Fri Jan 08 11:22:41 2010 +0000
@@ -25,7 +25,6 @@ extern int pxm_to_node(int nid);
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 #define VIRTUAL_BUG_ON(x) 
-#define NODEMAPSIZE 0xfff
 
 extern void numa_add_cpu(int cpu);
 extern void numa_init_array(void);
@@ -51,7 +50,8 @@ static inline void clear_node_cpumask(in
 
 /* Simple perfect hash to map pdx to node numbers */
 extern int memnode_shift; 
-extern u8  memnodemap[NODEMAPSIZE]; 
+extern unsigned long memnodemapsize;
+extern u8 *memnodemap;
 
 struct node_data {
     unsigned long node_start_pfn;
@@ -64,7 +64,7 @@ static inline __attribute__((pure)) int 
 static inline __attribute__((pure)) int phys_to_nid(paddr_t addr) 
 { 
        unsigned nid;
-       VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= NODEMAPSIZE);
+       VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize);
        nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; 
        VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); 
        return nid; 

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-changelog] [xen-unstable] x86: fix NUMA handling (c/s 20599:e5a757ce7845), Xen patchbot-unstable <=