WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] [xen-unstable] [XEN] Add basic NUMA/SRAT support to Xen

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] [XEN] Add basic NUMA/SRAT support to Xen from Linux 2.6.16.29.
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Wed, 25 Oct 2006 15:20:16 +0000
Delivery-date: Wed, 25 Oct 2006 08:20:49 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxx
# Node ID f312c2d01d8b9a3c237543b65157da83696cbff5
# Parent  a1f987e9640f3824b15158be1ba0d426503e282f
[XEN] Add basic NUMA/SRAT support to Xen from Linux 2.6.16.29.
Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
---
 xen/arch/x86/Makefile                        |    2 
 xen/arch/x86/numa.c                          |  302 +++++++++++++++++++++++
 xen/arch/x86/setup.c                         |   34 ++
 xen/arch/x86/smpboot.c                       |    4 
 xen/arch/x86/srat.c                          |  325 +++++++++++++++++++++++++
 xen/drivers/acpi/Makefile                    |    1 
 xen/drivers/acpi/numa.c                      |  216 +++++++++++++++++
 xen/include/asm-x86/acpi.h                   |    3 
 xen/include/asm-x86/config.h                 |    5 
 xen/include/asm-x86/mach-generic/mach_apic.h |    6 
 xen/include/asm-x86/numa.h                   |   65 +++++
 xen/include/asm-x86/numnodes.h               |   26 ++
 xen/include/asm-x86/topology.h               |   40 +++
 xen/include/xen/config.h                     |    2 
 xen/include/xen/nodemask.h                   |  342 +++++++++++++++++++++++++++
 xen/include/xen/numa.h                       |   35 ++
 xen/include/xen/topology.h                   |   27 ++
 17 files changed, 1428 insertions(+), 7 deletions(-)

diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile     Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/arch/x86/Makefile     Wed Oct 25 12:25:54 2006 +0100
@@ -28,12 +28,14 @@ obj-y += mm.o
 obj-y += mm.o
 obj-y += mpparse.o
 obj-y += nmi.o
+obj-y += numa.o
 obj-y += physdev.o
 obj-y += rwlock.o
 obj-y += setup.o
 obj-y += shutdown.o
 obj-y += smp.o
 obj-y += smpboot.o
+obj-y += srat.o
 obj-y += string.o
 obj-y += sysctl.o
 obj-y += time.o
diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/arch/x86/setup.c      Wed Oct 25 12:25:54 2006 +0100
@@ -16,6 +16,7 @@
 #include <xen/percpu.h>
 #include <xen/hypercall.h>
 #include <xen/keyhandler.h>
+#include <xen/numa.h>
 #include <public/version.h>
 #include <asm/bitops.h>
 #include <asm/smp.h>
@@ -25,10 +26,12 @@
 #include <asm/desc.h>
 #include <asm/shadow.h>
 #include <asm/e820.h>
+#include <asm/numa.h>
 #include <acm/acm_hooks.h>
 
 extern void dmi_scan_machine(void);
 extern void generic_apic_probe(void);
+extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
 
 /*
  * opt_xenheap_megabytes: Size of Xen heap in megabytes, excluding the
@@ -59,6 +62,9 @@ boolean_param("watchdog", opt_watchdog);
 /* "acpi=noirq":  Disables ACPI interrupt routing.                  */
 static void parse_acpi_param(char *s);
 custom_param("acpi", parse_acpi_param);
+
+extern int numa_setup(char *s);
+custom_param("numa", numa_setup);
 
 /* **** Linux config option: propagated to domain0. */
 /* acpi_skip_timer_override: Skip IRQ0 overrides. */
@@ -255,6 +261,20 @@ static void __init init_idle_domain(void
     idle_vcpu[0] = this_cpu(curr_vcpu) = current;
 
     setup_idle_pagetable();
+}
+
+static void srat_detect_node(int cpu)
+{
+   unsigned node;
+   u8 apicid = x86_cpu_to_apicid[cpu];
+
+   node = apicid_to_node[apicid];
+   if (node == NUMA_NO_NODE)
+      node = 0;
+   numa_set_node(cpu, node);
+
+   if (acpi_numa > 0)
+      printk(KERN_INFO "CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
 }
 
 void __init __start_xen(multiboot_info_t *mbi)
@@ -485,6 +505,12 @@ void __init __start_xen(multiboot_info_t
 
     init_frametable();
 
+    acpi_boot_table_init();
+
+    acpi_numa_init();
+
+    numa_initmem_init(0, max_page);
+
     end_boot_allocator();
 
     /* Initialise the Xen heap, skipping RAM holes. */
@@ -536,8 +562,9 @@ void __init __start_xen(multiboot_info_t
 
     generic_apic_probe();
 
-    acpi_boot_table_init();
     acpi_boot_init();
+
+    init_cpu_to_node();
 
     if ( smp_found_config )
         get_smp_config();
@@ -589,6 +616,11 @@ void __init __start_xen(multiboot_info_t
             break;
         if ( !cpu_online(i) )
             __cpu_up(i);
+
+               /* setup cpu_to_node[] */
+        srat_detect_node(i);
+               /* setup node_to_cpumask based on cpu_to_node[] */
+        numa_add_cpu(i);        
     }
 
     printk("Brought up %ld CPUs\n", (long)num_online_cpus());
diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/arch/x86/smpboot.c    Wed Oct 25 12:25:54 2006 +0100
@@ -43,6 +43,8 @@
 #include <xen/delay.h>
 #include <xen/softirq.h>
 #include <xen/serial.h>
+#include <xen/numa.h>
+#include <asm/numa.h>
 #include <asm/current.h>
 #include <asm/mc146818rtc.h>
 #include <asm/desc.h>
@@ -628,7 +630,7 @@ static void map_cpu_to_logical_apicid(vo
 static void map_cpu_to_logical_apicid(void)
 {
        int cpu = smp_processor_id();
-       int apicid = logical_smp_processor_id();
+       int apicid = hard_smp_processor_id();
 
        cpu_2_logical_apicid[cpu] = apicid;
        map_cpu_to_node(cpu, apicid_to_node(apicid));
diff -r a1f987e9640f -r f312c2d01d8b xen/drivers/acpi/Makefile
--- a/xen/drivers/acpi/Makefile Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/drivers/acpi/Makefile Wed Oct 25 12:25:54 2006 +0100
@@ -1,1 +1,2 @@ obj-y += tables.o
 obj-y += tables.o
+obj-y += numa.o
diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/acpi.h
--- a/xen/include/asm-x86/acpi.h        Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/include/asm-x86/acpi.h        Wed Oct 25 12:25:54 2006 +0100
@@ -157,6 +157,8 @@ static inline void check_acpi_pci(void) 
 
 static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
 static inline int acpi_irq_balance_set(char *str) { return 0; }
+extern int acpi_scan_nodes(u64 start, u64 end);
+extern int acpi_numa;
 
 #ifdef CONFIG_ACPI_SLEEP
 
@@ -173,5 +175,6 @@ extern void acpi_reserve_bootmem(void);
 #endif /*CONFIG_ACPI_SLEEP*/
 
 extern u8 x86_acpiid_to_apicid[];
+#define MAX_LOCAL_APIC 256
 
 #endif /*_ASM_ACPI_H*/
diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h      Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/include/asm-x86/config.h      Wed Oct 25 12:25:54 2006 +0100
@@ -24,6 +24,11 @@
 #define CONFIG_X86_IO_APIC 1
 #define CONFIG_HPET_TIMER 1
 #define CONFIG_X86_MCE_P4THERMAL 1
+#define CONFIG_ACPI_NUMA 1
+#define CONFIG_NUMA 1
+#define CONFIG_ACPI_SRAT 1
+#define CONFIG_DISCONTIGMEM 1
+#define CONFIG_NUMA_EMU 1
 
 /* Intel P4 currently has largest cache line (L2 line size is 128 bytes). */
 #define CONFIG_X86_L1_CACHE_SHIFT 7
diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/mach-generic/mach_apic.h
--- a/xen/include/asm-x86/mach-generic/mach_apic.h      Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/include/asm-x86/mach-generic/mach_apic.h      Wed Oct 25 12:25:54 2006 +0100
@@ -22,11 +22,7 @@ static inline void enable_apic_mode(void
        return;
 }
 
-/* No sane NUMA support right now. We should parse ACPI SRAT. */
-static inline int apicid_to_node(int logical_apicid)
-{
-       return 0;
-}
+#define apicid_to_node(apicid) ((int)apicid_to_node[(u8)apicid])
 
 extern u8 bios_cpu_apicid[];
 static inline int cpu_present_to_apicid(int mps_cpu)
diff -r a1f987e9640f -r f312c2d01d8b xen/include/xen/config.h
--- a/xen/include/xen/config.h  Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/include/xen/config.h  Wed Oct 25 12:25:54 2006 +0100
@@ -50,5 +50,7 @@
 #endif /* !__ASSEMBLY__ */
 
 #define fastcall
+#define __cpuinitdata
+#define __cpuinit
 
 #endif /* __XEN_CONFIG_H__ */
diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/numa.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/numa.c       Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,302 @@
+/* 
+ * Generic VM initialization for x86-64 NUMA setups.
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Adapted for Xen: Ryan Harper <ryanh@xxxxxxxxxx>
+ */ 
+
+#include <xen/mm.h>
+#include <xen/string.h>
+#include <xen/init.h>
+#include <xen/ctype.h>
+#include <xen/nodemask.h>
+#include <xen/numa.h>
+#include <xen/keyhandler.h>
+#include <xen/time.h>
+
+#include <asm/numa.h>
+#include <asm/acpi.h>
+
+#ifndef Dprintk
+#define Dprintk(x...)
+#endif
+
+/* from proto.h */
+#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1)))
+
+struct node_data node_data[MAX_NUMNODES];
+
+int memnode_shift;
+u8  memnodemap[NODEMAPSIZE];
+
+unsigned int cpu_to_node[NR_CPUS] __read_mostly = {
+       [0 ... NR_CPUS-1] = NUMA_NO_NODE
+};
+unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+       [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
+
+nodemask_t node_online_map = { { [0] = 1UL } };
+
+int numa_off __initdata;
+
+int acpi_numa __initdata;
+
+/*
+ * Given a shift value, try to populate memnodemap[]
+ * Returns :
+ * 1 if OK
+ * 0 if memnodemap[] too small (or shift too small)
+ * -1 if node overlap or lost ram (shift too big)
+ */
+static int __init
+populate_memnodemap(const struct node *nodes, int numnodes, int shift)
+{
+       int i; 
+       int res = -1;
+       unsigned long addr, end;
+
+       if (shift >= 64)
+               return -1;
+       memset(memnodemap, 0xff, sizeof(memnodemap));
+       for (i = 0; i < numnodes; i++) {
+               addr = nodes[i].start;
+               end = nodes[i].end;
+               if (addr >= end)
+                       continue;
+               if ((end >> shift) >= NODEMAPSIZE)
+                       return 0;
+               do {
+                       if (memnodemap[addr >> shift] != 0xff)
+                               return -1;
+                       memnodemap[addr >> shift] = i;
+                       addr += (1UL << shift);
+               } while (addr < end);
+               res = 1;
+       } 
+       return res;
+}
+
+int __init compute_hash_shift(struct node *nodes, int numnodes)
+{
+       int shift = 20;
+
+       while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
+               shift++;
+
+       printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
+               shift);
+
+       if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+               printk(KERN_INFO
+       "Your memory is not aligned you need to rebuild your kernel "
+       "with a bigger NODEMAPSIZE shift=%d\n",
+                       shift);
+               return -1;
+       }
+       return shift;
+}
+
+/* initialize NODE_DATA given nodeid and start/end */
+void __init setup_node_bootmem(int nodeid, u64 start, u64 end)
+{ 
+       unsigned long start_pfn, end_pfn;
+
+       start_pfn = start >> PAGE_SHIFT;
+       end_pfn = end >> PAGE_SHIFT;
+
+       NODE_DATA(nodeid)->node_id = nodeid;
+       NODE_DATA(nodeid)->node_start_pfn = start_pfn;
+       NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
+
+       node_set_online(nodeid);
+} 
+
+void __init numa_init_array(void)
+{
+       int rr, i;
+       /* There are unfortunately some poorly designed mainboards around
+          that only connect memory to a single CPU. This breaks the 1:1 cpu->node
+          mapping. To avoid this fill in the mapping for all possible
+          CPUs, as the number of CPUs is not known yet. 
+          We round robin the existing nodes. */
+       rr = first_node(node_online_map);
+       for (i = 0; i < NR_CPUS; i++) {
+               if (cpu_to_node[i] != NUMA_NO_NODE)
+                       continue;
+               numa_set_node(i, rr);
+               rr = next_node(rr, node_online_map);
+               if (rr == MAX_NUMNODES)
+                       rr = first_node(node_online_map);
+       }
+
+}
+
+#ifdef CONFIG_NUMA_EMU
+/* default to faking a single node as fallback for non-NUMA hardware */
+int numa_fake __initdata = 1;
+
+/* Numa emulation */
+static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+{
+       int i;
+       struct node nodes[MAX_NUMNODES];
+       unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+
+       /* Kludge needed for the hash function */
+       if (hweight64(sz) > 1) {
+               unsigned long x = 1;
+               while ((x << 1) < sz)
+                       x <<= 1;
+               if (x < sz/2)
+                       printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
+               sz = x;
+       }
+
+       memset(&nodes,0,sizeof(nodes));
+       for (i = 0; i < numa_fake; i++) {
+               nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
+               if (i == numa_fake-1)
+                       sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
+               nodes[i].end = nodes[i].start + sz;
+               printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
+                      i,
+                      nodes[i].start, nodes[i].end,
+                      (nodes[i].end - nodes[i].start) >> 20);
+               node_set_online(i);
+       }
+       memnode_shift = compute_hash_shift(nodes, numa_fake);
+       if (memnode_shift < 0) {
+               memnode_shift = 0;
+               printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
+               return -1;
+       }
+       for_each_online_node(i)
+               setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+       numa_init_array();
+       return 0;
+}
+#endif
+
+void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{ 
+       int i;
+
+#ifdef CONFIG_ACPI_NUMA
+       if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
+                                         end_pfn << PAGE_SHIFT))
+               return;
+#endif
+
+#ifdef CONFIG_NUMA_EMU
+   /* fake a numa node for non-numa hardware */
+       if (numa_fake && !numa_emulation(start_pfn, end_pfn))
+               return;
+#endif
+
+       printk(KERN_INFO "%s\n",
+              numa_off ? "NUMA turned off" : "No NUMA configuration found");
+
+       printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 
+              start_pfn << PAGE_SHIFT,
+              end_pfn << PAGE_SHIFT); 
+               /* setup dummy node covering all memory */ 
+       memnode_shift = 63; 
+       memnodemap[0] = 0;
+       nodes_clear(node_online_map);
+       node_set_online(0);
+       for (i = 0; i < NR_CPUS; i++)
+               numa_set_node(i, 0);
+       node_to_cpumask[0] = cpumask_of_cpu(0);
+       setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
+}
+
+__cpuinit void numa_add_cpu(int cpu)
+{
+       set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+} 
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+       cpu_to_node[cpu] = node;
+}
+
+/* [numa=off] */
+__init int numa_setup(char *opt) 
+{ 
+       if (!strncmp(opt,"off",3))
+               numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+       if(!strncmp(opt, "fake=", 5)) {
+               numa_fake = simple_strtoul(opt+5,NULL,0); ;
+               if (numa_fake >= MAX_NUMNODES)
+                       numa_fake = MAX_NUMNODES;
+       }
+#endif
+#ifdef CONFIG_ACPI_NUMA
+       if (!strncmp(opt,"noacpi",6))
+               acpi_numa = -1;
+#endif
+       return 1;
+} 
+
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ */
+void __init init_cpu_to_node(void)
+{
+       int i;
+       for (i = 0; i < NR_CPUS; i++) {
+               u8 apicid = x86_cpu_to_apicid[i];
+               if (apicid == BAD_APICID)
+                       continue;
+               if (apicid_to_node[apicid] == NUMA_NO_NODE)
+                       continue;
+               numa_set_node(i,apicid_to_node[apicid]);
+       }
+}
+
+EXPORT_SYMBOL(cpu_to_node);
+EXPORT_SYMBOL(node_to_cpumask);
+EXPORT_SYMBOL(memnode_shift);
+EXPORT_SYMBOL(memnodemap);
+EXPORT_SYMBOL(node_data);
+
+static void dump_numa(unsigned char key)
+{
+       s_time_t now = NOW();
+       int i;
+
+       printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
+                 (u32)(now>>32), (u32)now);
+
+       for_each_online_node(i) {
+               unsigned long pa = (NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT;
+               printk("idx%d -> NODE%d start->%lu size->%lu\n",
+                         i, NODE_DATA(i)->node_id,
+                         NODE_DATA(i)->node_start_pfn,
+                         NODE_DATA(i)->node_spanned_pages);
+               /* sanity check phys_to_nid() */
+               printk("phys_to_nid(%lx) -> %d should be %d\n", pa, phys_to_nid(pa),
+                         NODE_DATA(i)->node_id);
+       }
+       for_each_online_cpu(i)
+               printk("CPU%d -> NODE%d\n", i, cpu_to_node[i]);
+}
+
+static __init int register_numa_trigger(void)
+{
+       register_keyhandler('u', dump_numa, "dump numa info");
+       return 0;
+}
+__initcall(register_numa_trigger);
+
diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/srat.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/srat.c       Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,325 @@
+/*
+ * ACPI 3.0 based NUMA setup
+ * Copyright 2004 Andi Kleen, SuSE Labs.
+ *
+ * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
+ *
+ * Called from acpi_numa_init while reading the SRAT and SLIT tables.
+ * Assumes all memory regions belonging to a single proximity domain
+ * are in one chunk. Holes between them will be included in the node.
+ * 
+ * Adapted for Xen: Ryan Harper <ryanh@xxxxxxxxxx>
+ */
+
+#if 0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/proto.h>
+#include <xen/bitmap.h>
+#include <xen/numa.h>
+#include <xen/topology.h>
+#include <asm/e820.h>
+#endif
+#include <xen/init.h>
+#include <xen/mm.h>
+#include <xen/inttypes.h>
+#include <xen/nodemask.h>
+#include <xen/acpi.h>
+
+#include <asm/numa.h>
+#include <asm/page.h>
+
+static struct acpi_table_slit *acpi_slit;
+
+static nodemask_t nodes_parsed __initdata;
+static nodemask_t nodes_found __initdata;
+static struct node nodes[MAX_NUMNODES] __initdata;
+static u8 pxm2node[256] = { [0 ... 255] = 0xff };
+
+/* Too small nodes confuse the VM badly. Usually they result
+   from BIOS bugs. */
+#define NODE_MIN_SIZE (4*1024*1024)
+
+static int node_to_pxm(int n);
+
+int pxm_to_node(int pxm)
+{
+       if ((unsigned)pxm >= 256)
+               return -1;
+       /* Extend 0xff to (int)-1 */
+       return (signed char)pxm2node[pxm];
+}
+
+static __init int setup_node(int pxm)
+{
+       unsigned node = pxm2node[pxm];
+       if (node == 0xff) {
+               if (nodes_weight(nodes_found) >= MAX_NUMNODES)
+                       return -1;
+               node = first_unset_node(nodes_found); 
+               node_set(node, nodes_found);
+               pxm2node[pxm] = node;
+       }
+       return pxm2node[pxm];
+}
+
+static __init int conflicting_nodes(u64 start, u64 end)
+{
+       int i;
+       for_each_node_mask(i, nodes_parsed) {
+               struct node *nd = &nodes[i];
+               if (nd->start == nd->end)
+                       continue;
+               if (nd->end > start && nd->start < end)
+                       return i;
+               if (nd->end == end && nd->start == start)
+                       return i;
+       }
+       return -1;
+}
+
+static __init void cutoff_node(int i, u64 start, u64 end)
+{
+       struct node *nd = &nodes[i];
+       if (nd->start < start) {
+               nd->start = start;
+               if (nd->end < nd->start)
+                       nd->start = nd->end;
+       }
+       if (nd->end > end) {
+               nd->end = end;
+               if (nd->start > nd->end)
+                       nd->start = nd->end;
+       }
+}
+
+static __init void bad_srat(void)
+{
+       int i;
+       printk(KERN_ERR "SRAT: SRAT not used.\n");
+       acpi_numa = -1;
+       for (i = 0; i < MAX_LOCAL_APIC; i++)
+               apicid_to_node[i] = NUMA_NO_NODE;
+}
+
+static __init inline int srat_disabled(void)
+{
+       return numa_off || acpi_numa < 0;
+}
+
+/*
+ * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
+ * up the NUMA heuristics which wants the local node to have a smaller
+ * distance than the others.
+ * Do some quick checks here and only use the SLIT if it passes.
+ */
+static __init int slit_valid(struct acpi_table_slit *slit)
+{
+       int i, j;
+       int d = slit->localities;
+       for (i = 0; i < d; i++) {
+               for (j = 0; j < d; j++)  {
+                       u8 val = slit->entry[d*i + j];
+                       if (i == j) {
+                               if (val != 10)
+                                       return 0;
+                       } else if (val <= 10)
+                               return 0;
+               }
+       }
+       return 1;
+}
+
+/* Callback for SLIT parsing */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+       if (!slit_valid(slit)) {
+               printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
+               return;
+       }
+       acpi_slit = slit;
+}
+
+/* Callback for Proximity Domain -> LAPIC mapping */
+void __init
+acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
+{
+       int pxm, node;
+       if (srat_disabled())
+               return;
+       if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
+               bad_srat();
+               return;
+       }
+       if (pa->flags.enabled == 0)
+               return;
+       pxm = pa->proximity_domain;
+       node = setup_node(pxm);
+       if (node < 0) {
+               printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+               bad_srat();
+               return;
+       }
+       apicid_to_node[pa->apic_id] = node;
+       acpi_numa = 1;
+       printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+              pxm, pa->apic_id, node);
+}
+
+/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
+void __init
+acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
+{
+       struct node *nd;
+       u64 start, end;
+       int node, pxm;
+       int i;
+
+       if (srat_disabled())
+               return;
+       if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) {
+               bad_srat();
+               return;
+       }
+       if (ma->flags.enabled == 0)
+               return;
+       start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
+       end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
+       pxm = ma->proximity_domain;
+       node = setup_node(pxm);
+       if (node < 0) {
+               printk(KERN_ERR "SRAT: Too many proximity domains.\n");
+               bad_srat();
+               return;
+       }
+       /* It is fine to add this area to the node's data; it will be used later */
+       if (ma->flags.hot_pluggable == 1)
+               printk(KERN_INFO "SRAT: hot plug zone found %"PRIx64" - %"PRIx64" \n",
+                               start, end);
+       i = conflicting_nodes(start, end);
+       if (i == node) {
+               printk(KERN_WARNING
+               "SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with itself (%"
+               PRIx64"-%"PRIx64")\n", pxm, start, end, nodes[i].start, nodes[i].end);
+       } else if (i >= 0) {
+               printk(KERN_ERR
+                      "SRAT: PXM %d (%"PRIx64"-%"PRIx64") overlaps with PXM %d (%"
+                      PRIx64"-%"PRIx64")\n", pxm, start, end, node_to_pxm(i),
+                          nodes[i].start, nodes[i].end);
+               bad_srat();
+               return;
+       }
+       nd = &nodes[node];
+       if (!node_test_and_set(node, nodes_parsed)) {
+               nd->start = start;
+               nd->end = end;
+       } else {
+               if (start < nd->start)
+                       nd->start = start;
+               if (nd->end < end)
+                       nd->end = end;
+       }
+       printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"\n", node, pxm,
+              nd->start, nd->end);
+}
+
+/* Sanity check to catch more bad SRATs (they are amazingly common).
+   Make sure the PXMs cover all memory. */
+static int nodes_cover_memory(void)
+{
+       int i;
+       u64 pxmram, e820ram;
+
+       pxmram = 0;
+       for_each_node_mask(i, nodes_parsed) {
+               u64 s = nodes[i].start >> PAGE_SHIFT;
+               u64 e = nodes[i].end >> PAGE_SHIFT;
+               pxmram += e - s;
+       }
+
+       e820ram = max_page;
+       /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
+       if ((long)(e820ram - pxmram) >= 1*1024*1024) {
+               printk(KERN_ERR "SRAT: PXMs only cover %"PRIu64"MB of your %"
+                       PRIu64"MB e820 RAM. Not used.\n",
+                       (pxmram << PAGE_SHIFT) >> 20,
+                       (e820ram << PAGE_SHIFT) >> 20);
+               return 0;
+       }
+       return 1;
+}
+
+static void unparse_node(int node)
+{
+       int i;
+       node_clear(node, nodes_parsed);
+       for (i = 0; i < MAX_LOCAL_APIC; i++) {
+               if (apicid_to_node[i] == node)
+                       apicid_to_node[i] = NUMA_NO_NODE;
+       }
+}
+
+void __init acpi_numa_arch_fixup(void) {}
+
+/* Use the information discovered above to actually set up the nodes. */
+int __init acpi_scan_nodes(u64 start, u64 end)
+{
+       int i;
+
+       /* First clean up the node list */
+       for (i = 0; i < MAX_NUMNODES; i++) {
+               cutoff_node(i, start, end);
+               if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
+                       unparse_node(i);
+       }
+
+       if (acpi_numa <= 0)
+               return -1;
+
+       if (!nodes_cover_memory()) {
+               bad_srat();
+               return -1;
+       }
+
+       memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
+       if (memnode_shift < 0) {
+               printk(KERN_ERR
+                    "SRAT: No NUMA node hash function found. Contact maintainer\n");
+               bad_srat();
+               return -1;
+       }
+
+       /* Finally register nodes */
+       for_each_node_mask(i, nodes_parsed)
+               setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+       for (i = 0; i < NR_CPUS; i++) { 
+               if (cpu_to_node[i] == NUMA_NO_NODE)
+                       continue;
+               if (!node_isset(cpu_to_node[i], nodes_parsed))
+                       numa_set_node(i, NUMA_NO_NODE);
+       }
+       numa_init_array();
+       return 0;
+}
+
+static int node_to_pxm(int n)
+{
+       int i;
+       if (pxm2node[n] == n)
+               return n;
+       for (i = 0; i < 256; i++)
+               if (pxm2node[i] == n)
+                       return i;
+       return 0;
+}
+
+int __node_distance(int a, int b)
+{
+       int index;
+
+       if (!acpi_slit)
+               return a == b ? 10 : 20;
+       index = acpi_slit->localities * node_to_pxm(a);
+       return acpi_slit->entry[index + node_to_pxm(b)];
+}
+
+EXPORT_SYMBOL(__node_distance);
diff -r a1f987e9640f -r f312c2d01d8b xen/drivers/acpi/numa.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/drivers/acpi/numa.c   Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,216 @@
+/*
+ *  acpi_numa.c - ACPI NUMA support
+ *
+ *  Copyright (C) 2002 Takayoshi Kochi <t-kochi@xxxxxxxxxxxxx>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ */
+#if 0
+#include <linux/module.h>
+#include <linux/kernel.h>
+#endif
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/errno.h>
+#include <xen/acpi.h>
+#include <xen/numa.h>
+#include <acpi/acpi_bus.h>
+#include <acpi/acmacros.h>
+#include <asm/page.h> /* __va() */
+
+#define ACPI_NUMA      0x80000000
+#define _COMPONENT     ACPI_NUMA
+ACPI_MODULE_NAME("numa")
+
+extern int __init acpi_table_parse_madt_family(enum acpi_table_id id,
+                                              unsigned long madt_size,
+                                              int entry_id,
+                                              acpi_madt_entry_handler handler,
+                                              unsigned int max_entries);
+
+void __init acpi_table_print_srat_entry(acpi_table_entry_header * header)
+{
+
+       ACPI_FUNCTION_NAME("acpi_table_print_srat_entry");
+
+       if (!header)
+               return;
+
+       switch (header->type) {
+
+       case ACPI_SRAT_PROCESSOR_AFFINITY:
+#ifdef ACPI_DEBUG_OUTPUT
+               {
+                       struct acpi_table_processor_affinity *p =
+                           (struct acpi_table_processor_affinity *)header;
+                       ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+                                         "SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n",
+                                         p->apic_id, p->lsapic_eid,
+                                         p->proximity_domain,
+                                         p->flags.
+                                         enabled ? "enabled" : "disabled"));
+               }
+#endif                         /* ACPI_DEBUG_OUTPUT */
+               break;
+
+       case ACPI_SRAT_MEMORY_AFFINITY:
+#ifdef ACPI_DEBUG_OUTPUT
+               {
+                       struct acpi_table_memory_affinity *p =
+                           (struct acpi_table_memory_affinity *)header;
+                       ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+                                         "SRAT Memory (0x%08x%08x length 0x%08x%08x type 0x%x) in proximity domain %d %s%s\n",
+                                         p->base_addr_hi, p->base_addr_lo,
+                                         p->length_hi, p->length_lo,
+                                         p->memory_type, p->proximity_domain,
+                                         p->flags.
+                                         enabled ? "enabled" : "disabled",
+                                         p->flags.
+                                         hot_pluggable ? " hot-pluggable" :
+                                         ""));
+               }
+#endif                         /* ACPI_DEBUG_OUTPUT */
+               break;
+
+       default:
+               printk(KERN_WARNING PREFIX
+                      "Found unsupported SRAT entry (type = 0x%x)\n",
+                      header->type);
+               break;
+       }
+}
+
+static int __init acpi_parse_slit(unsigned long phys_addr, unsigned long size)
+{
+       struct acpi_table_slit *slit;
+       u32 localities;
+
+       if (!phys_addr || !size)
+               return -EINVAL;
+
+       slit = (struct acpi_table_slit *)__va(phys_addr);
+
+       /* downcast just for %llu vs %lu for i386/ia64  */
+       localities = (u32) slit->localities;
+
+       acpi_numa_slit_init(slit);
+
+       return 0;
+}
+
+static int __init
+acpi_parse_processor_affinity(acpi_table_entry_header * header,
+                             const unsigned long end)
+{
+       struct acpi_table_processor_affinity *processor_affinity;
+
+       processor_affinity = (struct acpi_table_processor_affinity *)header;
+       if (!processor_affinity)
+               return -EINVAL;
+
+       acpi_table_print_srat_entry(header);
+
+       /* let architecture-dependent part to do it */
+       acpi_numa_processor_affinity_init(processor_affinity);
+
+       return 0;
+}
+
+static int __init
+acpi_parse_memory_affinity(acpi_table_entry_header * header,
+                          const unsigned long end)
+{
+       struct acpi_table_memory_affinity *memory_affinity;
+
+       memory_affinity = (struct acpi_table_memory_affinity *)header;
+       if (!memory_affinity)
+               return -EINVAL;
+
+       acpi_table_print_srat_entry(header);
+
+       /* let architecture-dependent part to do it */
+       acpi_numa_memory_affinity_init(memory_affinity);
+
+       return 0;
+}
+
+static int __init acpi_parse_srat(unsigned long phys_addr, unsigned long size)
+{
+       struct acpi_table_srat *srat;
+
+       if (!phys_addr || !size)
+               return -EINVAL;
+
+       srat = (struct acpi_table_srat *)__va(phys_addr);
+
+       return 0;
+}
+
+int __init
+acpi_table_parse_srat(enum acpi_srat_entry_id id,
+                     acpi_madt_entry_handler handler, unsigned int max_entries)
+{
+       return acpi_table_parse_madt_family(ACPI_SRAT,
+                                           sizeof(struct acpi_table_srat), id,
+                                           handler, max_entries);
+}
+
+int __init acpi_numa_init(void)
+{
+       int result;
+
+       /* SRAT: Static Resource Affinity Table */
+       result = acpi_table_parse(ACPI_SRAT, acpi_parse_srat);
+
+       if (result > 0) {
+               result = acpi_table_parse_srat(ACPI_SRAT_PROCESSOR_AFFINITY,
+                                              acpi_parse_processor_affinity,
+                                              NR_CPUS);
+               result = acpi_table_parse_srat(ACPI_SRAT_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS); // IA64 specific
+       }
+
+       /* SLIT: System Locality Information Table */
+       result = acpi_table_parse(ACPI_SLIT, acpi_parse_slit);
+
+       acpi_numa_arch_fixup();
+       return 0;
+}
+
+#if 0
+int acpi_get_pxm(acpi_handle h)
+{
+       unsigned long pxm;
+       acpi_status status;
+       acpi_handle handle;
+       acpi_handle phandle = h;
+
+       do {
+               handle = phandle;
+               status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm);
+               if (ACPI_SUCCESS(status))
+                       return (int)pxm;
+               status = acpi_get_parent(handle, &phandle);
+       } while (ACPI_SUCCESS(status));
+       return -1;
+}
+
+EXPORT_SYMBOL(acpi_get_pxm);
+#endif
diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/numa.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/numa.h        Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,65 @@
+#ifndef _ASM_X8664_NUMA_H 
+#define _ASM_X8664_NUMA_H 1
+
+#include <xen/nodemask.h>
+#include <xen/topology.h>
+#include <asm/numnodes.h>
+#include <asm/smp.h>
+
+struct node { 
+       u64 start,end; 
+};
+
+extern int compute_hash_shift(struct node *nodes, int numnodes);
+extern int pxm_to_node(int nid);
+
+#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
+#define VIRTUAL_BUG_ON(x) 
+#define NODEMAPSIZE 0xfff
+
+extern void numa_add_cpu(int cpu);
+extern void numa_init_array(void);
+extern int numa_off;
+
+extern void numa_set_node(int cpu, int node);
+
+extern void setup_node_bootmem(int nodeid, u64 start, u64 end);
+extern unsigned char apicid_to_node[256];
+#ifdef CONFIG_NUMA
+extern void __init init_cpu_to_node(void);
+
+static inline void clear_node_cpumask(int cpu)
+{
+       clear_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+}
+
+/* Simple perfect hash to map physical addresses to node numbers */
+extern int memnode_shift; 
+extern u8  memnodemap[NODEMAPSIZE]; 
+
+extern struct node_data node_data[];
+
+static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) 
+{ 
+       unsigned nid; 
+       VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
+       nid = memnodemap[addr >> memnode_shift]; 
+       VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); 
+       return nid; 
+} 
+
+#define NODE_DATA(nid)         (&(node_data[nid]))
+
+#define node_start_pfn(nid)    (NODE_DATA(nid)->node_start_pfn)
+#define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
+                                NODE_DATA(nid)->node_spanned_pages)
+
+
+#else
+#define init_cpu_to_node() do {} while (0)
+#define clear_node_cpumask(cpu) do {} while (0)
+#endif
+
+#define NUMA_NO_NODE 0xff
+
+#endif
diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/numnodes.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/numnodes.h    Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,26 @@
+#ifndef _ASM_MAX_NUMNODES_H
+#define _ASM_MAX_NUMNODES_H
+
+#include <xen/config.h>
+
+#if defined(__i386__)
+#ifdef CONFIG_X86_NUMAQ
+
+/* Max 16 Nodes */
+#define NODES_SHIFT    4
+
+#elif defined(CONFIG_ACPI_SRAT)
+
+/* Max 8 Nodes */
+#define NODES_SHIFT    3
+
+#endif /* CONFIG_X86_NUMAQ */
+
+
+#endif /* __i386__ */
+
+#if defined(CONFIG_NUMA) && defined(__x86_64__)
+#define NODES_SHIFT  6
+#endif /* __x86_64__ */
+
+#endif /* _ASM_MAX_NUMNODES_H */
diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/topology.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/topology.h    Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2006, IBM Corp.
+ *
+ * All rights reserved.          
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Ryan Harper <ryanh@xxxxxxxxxx>
+ */
+
+#ifndef _ASM_X86_TOPOLOGY_H
+#define _ASM_X86_TOPOLOGY_H
+
+#include <xen/config.h>
+#include <xen/bitops.h>
+
+extern cpumask_t cpu_online_map;
+
+extern unsigned int cpu_to_node[];
+extern cpumask_t     node_to_cpumask[];
+
+#define cpu_to_node(cpu)               (cpu_to_node[cpu])
+#define parent_node(node)              (node)
+#define node_to_first_cpu(node)  (__ffs(node_to_cpumask[node]))
+#define node_to_cpumask(node)    (node_to_cpumask[node])
+
+#endif  /* _ASM_X86_TOPOLOGY_H */
diff -r a1f987e9640f -r f312c2d01d8b xen/include/xen/nodemask.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/xen/nodemask.h        Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,342 @@
+#ifndef __LINUX_NODEMASK_H
+#define __LINUX_NODEMASK_H
+
+/*
+ * Nodemasks provide a bitmap suitable for representing the
+ * set of Node's in a system, one bit position per Node number.
+ *
+ * See detailed comments in the file linux/bitmap.h describing the
+ * data type on which these nodemasks are based.
+ *
+ * For details of nodemask_scnprintf() and nodemask_parse(),
+ * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
+ *
+ * The available nodemask operations are:
+ *
+ * void node_set(node, mask)           turn on bit 'node' in mask
+ * void node_clear(node, mask)         turn off bit 'node' in mask
+ * void nodes_setall(mask)             set all bits
+ * void nodes_clear(mask)              clear all bits
+ * int node_isset(node, mask)          true iff bit 'node' set in mask
+ * int node_test_and_set(node, mask)   test and set bit 'node' in mask
+ *
+ * void nodes_and(dst, src1, src2)     dst = src1 & src2  [intersection]
+ * void nodes_or(dst, src1, src2)      dst = src1 | src2  [union]
+ * void nodes_xor(dst, src1, src2)     dst = src1 ^ src2
+ * void nodes_andnot(dst, src1, src2)  dst = src1 & ~src2
+ * void nodes_complement(dst, src)     dst = ~src
+ *
+ * int nodes_equal(mask1, mask2)       Does mask1 == mask2?
+ * int nodes_intersects(mask1, mask2)  Do mask1 and mask2 intersect?
+ * int nodes_subset(mask1, mask2)      Is mask1 a subset of mask2?
+ * int nodes_empty(mask)               Is mask empty (no bits sets)?
+ * int nodes_full(mask)                        Is mask full (all bits sets)?
+ * int nodes_weight(mask)              Hamming weight - number of set bits
+ *
+ * void nodes_shift_right(dst, src, n) Shift right
+ * void nodes_shift_left(dst, src, n)  Shift left
+ *
+ * int first_node(mask)                        Number lowest set bit, or MAX_NUMNODES
+ * int next_node(node, mask)           Next node past 'node', or MAX_NUMNODES
+ * int first_unset_node(mask)          First node not set in mask, or 
+ *                                     MAX_NUMNODES.
+ *
+ * nodemask_t nodemask_of_node(node)   Return nodemask with bit 'node' set
+ * NODE_MASK_ALL                       Initializer - all bits set
+ * NODE_MASK_NONE                      Initializer - no bits set
+ * unsigned long *nodes_addr(mask)     Array of unsigned long's in mask
+ *
+ * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
+ * int nodemask_parse(ubuf, ulen, mask)        Parse ascii string as nodemask
+ *
+ * for_each_node_mask(node, mask)      for-loop node over mask
+ *
+ * int num_online_nodes()              Number of online Nodes
+ * int num_possible_nodes()            Number of all possible Nodes
+ *
+ * int node_online(node)               Is some node online?
+ * int node_possible(node)             Is some node possible?
+ *
+ * int any_online_node(mask)           First online node in mask
+ *
+ * node_set_online(node)               set bit 'node' in node_online_map
+ * node_set_offline(node)              clear bit 'node' in node_online_map
+ *
+ * for_each_node(node)                 for-loop node over node_possible_map
+ * for_each_online_node(node)          for-loop node over node_online_map
+ *
+ * Subtlety:
+ * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway)
+ *    to generate slightly worse code.  So use a simple one-line #define
+ *    for node_isset(), instead of wrapping an inline inside a macro, the
+ *    way we do the other calls.
+ */
+
+#if 0
+#include <linux/threads.h>
+#include <asm/bug.h>
+#endif
+#include <xen/kernel.h>
+#include <xen/bitmap.h>
+#include <xen/numa.h>
+
+typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
+extern nodemask_t _unused_nodemask_arg_;
+
+#define node_set(node, dst) __node_set((node), &(dst))
+static inline void __node_set(int node, volatile nodemask_t *dstp)
+{
+       set_bit(node, dstp->bits);
+}
+
+#define node_clear(node, dst) __node_clear((node), &(dst))
+static inline void __node_clear(int node, volatile nodemask_t *dstp)
+{
+       clear_bit(node, dstp->bits);
+}
+
+#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
+static inline void __nodes_setall(nodemask_t *dstp, int nbits)
+{
+       bitmap_fill(dstp->bits, nbits);
+}
+
+#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
+static inline void __nodes_clear(nodemask_t *dstp, int nbits)
+{
+       bitmap_zero(dstp->bits, nbits);
+}
+
+/* No static inline type checking - see Subtlety (1) above. */
+#define node_isset(node, nodemask) test_bit((node), (nodemask).bits)
+
+#define node_test_and_set(node, nodemask) \
+                       __node_test_and_set((node), &(nodemask))
+static inline int __node_test_and_set(int node, nodemask_t *addr)
+{
+       return test_and_set_bit(node, addr->bits);
+}
+
+#define nodes_and(dst, src1, src2) \
+                       __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_or(dst, src1, src2) \
+                       __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_xor(dst, src1, src2) \
+                       __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_andnot(dst, src1, src2) \
+                       __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_complement(dst, src) \
+                       __nodes_complement(&(dst), &(src), MAX_NUMNODES)
+static inline void __nodes_complement(nodemask_t *dstp,
+                                       const nodemask_t *srcp, int nbits)
+{
+       bitmap_complement(dstp->bits, srcp->bits, nbits);
+}
+
+#define nodes_equal(src1, src2) \
+                       __nodes_equal(&(src1), &(src2), MAX_NUMNODES)
+static inline int __nodes_equal(const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       return bitmap_equal(src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_intersects(src1, src2) \
+                       __nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
+static inline int __nodes_intersects(const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       return bitmap_intersects(src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_subset(src1, src2) \
+                       __nodes_subset(&(src1), &(src2), MAX_NUMNODES)
+static inline int __nodes_subset(const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       return bitmap_subset(src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
+static inline int __nodes_empty(const nodemask_t *srcp, int nbits)
+{
+       return bitmap_empty(srcp->bits, nbits);
+}
+
+#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
+static inline int __nodes_full(const nodemask_t *srcp, int nbits)
+{
+       return bitmap_full(srcp->bits, nbits);
+}
+
+#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
+static inline int __nodes_weight(const nodemask_t *srcp, int nbits)
+{
+       return bitmap_weight(srcp->bits, nbits);
+}
+
+#define nodes_shift_right(dst, src, n) \
+                       __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
+static inline void __nodes_shift_right(nodemask_t *dstp,
+                                       const nodemask_t *srcp, int n, int nbits)
+{
+       bitmap_shift_right(dstp->bits, srcp->bits, n, nbits);
+}
+
+#define nodes_shift_left(dst, src, n) \
+                       __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES)
+static inline void __nodes_shift_left(nodemask_t *dstp,
+                                       const nodemask_t *srcp, int n, int nbits)
+{
+       bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
+}
+
+/* FIXME: better would be to fix all architectures to never return
+          > MAX_NUMNODES, then the silly min_ts could be dropped. */
+
+#define first_node(src) __first_node(&(src))
+static inline int __first_node(const nodemask_t *srcp)
+{
+       return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
+}
+
+#define next_node(n, src) __next_node((n), &(src))
+static inline int __next_node(int n, const nodemask_t *srcp)
+{
+       return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
+}
+
+#define nodemask_of_node(node)                                         \
+({                                                                     \
+       typeof(_unused_nodemask_arg_) m;                                \
+       if (sizeof(m) == sizeof(unsigned long)) {                       \
+               m.bits[0] = 1UL<<(node);                                \
+       } else {                                                        \
+               nodes_clear(m);                                         \
+               node_set((node), m);                                    \
+       }                                                               \
+       m;                                                              \
+})
+
+#define first_unset_node(mask) __first_unset_node(&(mask))
+static inline int __first_unset_node(const nodemask_t *maskp)
+{
+       return min_t(int,MAX_NUMNODES,
+                       find_first_zero_bit(maskp->bits, MAX_NUMNODES));
+}
+
+#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)
+
+#if MAX_NUMNODES <= BITS_PER_LONG
+
+#define NODE_MASK_ALL                                                  \
+((nodemask_t) { {                                                      \
+       [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD           \
+} })
+
+#else
+
+#define NODE_MASK_ALL                                                  \
+((nodemask_t) { {                                                      \
+       [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL,                   \
+       [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD           \
+} })
+
+#endif
+
+#define NODE_MASK_NONE                                                 \
+((nodemask_t) { {                                                      \
+       [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] =  0UL                    \
+} })
+
+#define nodes_addr(src) ((src).bits)
+
+#if 0
+#define nodemask_scnprintf(buf, len, src) \
+                       __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES)
+static inline int __nodemask_scnprintf(char *buf, int len,
+                                       const nodemask_t *srcp, int nbits)
+{
+       return bitmap_scnprintf(buf, len, srcp->bits, nbits);
+}
+
+#define nodemask_parse(ubuf, ulen, dst) \
+                       __nodemask_parse((ubuf), (ulen), &(dst), MAX_NUMNODES)
+static inline int __nodemask_parse(const char __user *buf, int len,
+                                       nodemask_t *dstp, int nbits)
+{
+       return bitmap_parse(buf, len, dstp->bits, nbits);
+}
+#endif
+
+#if MAX_NUMNODES > 1
+#define for_each_node_mask(node, mask)                 \
+       for ((node) = first_node(mask);                 \
+               (node) < MAX_NUMNODES;                  \
+               (node) = next_node((node), (mask)))
+#else /* MAX_NUMNODES == 1 */
+#define for_each_node_mask(node, mask)                 \
+       if (!nodes_empty(mask))                         \
+               for ((node) = 0; (node) < 1; (node)++)
+#endif /* MAX_NUMNODES */
+
+/*
+ * The following particular system nodemasks and operations
+ * on them manage all possible and online nodes.
+ */
+
+extern nodemask_t node_online_map;
+extern nodemask_t node_possible_map;
+
+#if MAX_NUMNODES > 1
+#define num_online_nodes()     nodes_weight(node_online_map)
+#define num_possible_nodes()   nodes_weight(node_possible_map)
+#define node_online(node)      node_isset((node), node_online_map)
+#define node_possible(node)    node_isset((node), node_possible_map)
+#else
+#define num_online_nodes()     1
+#define num_possible_nodes()   1
+#define node_online(node)      ((node) == 0)
+#define node_possible(node)    ((node) == 0)
+#endif
+
+#define any_online_node(mask)                  \
+({                                             \
+       int node;                               \
+       for_each_node_mask(node, (mask))        \
+               if (node_online(node))          \
+                       break;                  \
+       node;                                   \
+})
+
+#define node_set_online(node)     set_bit((node), node_online_map.bits)
+#define node_set_offline(node)    clear_bit((node), node_online_map.bits)
+
+#define for_each_node(node)       for_each_node_mask((node), node_possible_map)
+#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
+
+#endif /* __LINUX_NODEMASK_H */
diff -r a1f987e9640f -r f312c2d01d8b xen/include/xen/numa.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/xen/numa.h    Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,35 @@
+#ifndef _XEN_NUMA_H
+#define _XEN_NUMA_H
+
+#include <xen/config.h>
+
+#ifdef CONFIG_DISCONTIGMEM
+#include <asm/numnodes.h>
+#endif
+
+#ifndef NODES_SHIFT
+#define NODES_SHIFT     0
+#endif
+
+#define MAX_NUMNODES    (1 << NODES_SHIFT)
+#define NUMA_NO_NODE    0xff
+
+#define MAX_PXM_DOMAINS    256   /* 1 byte and no promises about values */
+#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
+#define MAX_CHUNKS_PER_NODE   4
+#define MAXCHUNKS    (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
+
+/* needed for drivers/acpi/numa.c */
+#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+
+extern unsigned int cpu_to_node[];
+#include <xen/cpumask.h>
+extern cpumask_t node_to_cpumask[];
+
+typedef struct node_data {
+    unsigned long node_start_pfn;
+    unsigned long node_spanned_pages;
+    unsigned int  node_id;
+} node_data_t;
+
+#endif /* _XEN_NUMA_H */
diff -r a1f987e9640f -r f312c2d01d8b xen/include/xen/topology.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/xen/topology.h        Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2006, IBM Corp.
+ *
+ * All rights reserved.          
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef _XEN_TOPOLOGY_H
+#define _XEN_TOPOLOGY_H
+
+#include <asm/topology.h>
+
+#endif /* _XEN_TOPOLOGY_H */

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-changelog] [xen-unstable] [XEN] Add basic NUMA/SRAT support to Xen from Linux 2.6.16.29., Xen patchbot-unstable <=