diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 86bef0f..7a24070 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -940,7 +940,8 @@ void __init setup_arch(char **cmdline_p)
 	/*
 	 * Parse SRAT to discover nodes.
 	 */
-	acpi_numa_init();
+	if (acpi_numa > 0)
+		acpi_numa_init();
 #endif
 
 	initmem_init(0, max_pfn);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913b..14fa654 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -11,7 +11,9 @@
 #include
 #include
 #include
+#include
 #include
+#include
 #include
 #include
@@ -19,6 +21,7 @@
 #include
 #include
 #include
+#include
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
@@ -428,7 +431,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 	 */
 	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
 		long n = simple_strtol(cmdline, NULL, 0);
-
 		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
 		if (num_nodes < 0)
 			return num_nodes;
@@ -522,16 +524,162 @@ out:
 	numa_init_array();
 	return 0;
 }
+struct xen_domain_numa_layout pv_numa_layout;
+
+void dump_numa_layout(struct xen_domain_numa_layout *layout)
+{
+	unsigned int i, j;
+	char vcpumask[128];
+
+	printk("NUMA-LAYOUT(Dom0) : vcpus(%u), vnodes(%u)\n",
+		layout->max_vcpus, layout->max_vnodes);
+	for (i = 0; i < layout->max_vnodes; i++)
+	{
+		struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
+		cpumask_scnprintf(vcpumask, sizeof(vcpumask),
+				((cpumask_t *)&vnode_data->vcpu_mask));
+		printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n",
+			vnode_data->vnode_id, vnode_data->mnode_id,
+			(unsigned long)vnode_data->nr_pages, vcpumask);
+	}
+
+	printk("vnode distances:\n");
+	for (i = 0; i < layout->max_vnodes; i++)
+		printk("\tvnode[%u]", i);
+	for (i = 0; i < layout->max_vnodes; i++)
+	{
+		printk("\nvnode[%u]", i);
+		for (j = 0; j < layout->max_vnodes; j++)
+			printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
+	}
+	printk("\n");
+	return;
+}
+
+static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
+{
+	/* Construct a slit table (using layout->vnode_distance).
+	 * Copy it to acpi_slit.
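+	 *
+	 * One possible construction (a sketch, not implemented here):
+	 * build a struct acpi_table_slit with locality_count set to
+	 * layout->max_vnodes, copy vnode_distance[i*max_vnodes + j] into
+	 * entry[i*locality_count + j], and hand it to acpi_numa_slit_init()
+	 * as if it were a firmware-provided SLIT, so that __node_distance()
+	 * reports the vnode distances.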
+	 */
+	return;
+}
+/* Distribute the vcpus over the vnodes according to their affinity */
+static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
+{
+	int vcpu, vnode;
+
+	printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");
+
+	for (vnode = 0; vnode < layout->max_vnodes; vnode++)
+	{
+		struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
+		cpumask_t vcpu_mask = *((cpumask_t *)&vnode_data->vcpu_mask);
+
+		for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
+		{
+			if (cpu_isset(vcpu, vcpu_mask))
+			{
+				if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
+				{
+					printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n",
+						vcpu, early_cpu_to_node(vcpu));
+					continue;
+				}
+				printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
+				numa_set_node(vcpu, vnode);
+			}
+		}
+	}
+	return;
+}
+
+static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
+		unsigned long start_pfn, unsigned long last_pfn)
+{
+	int num_vnodes, i;
+	u64 node_start_addr, node_end_addr, max_addr;
+
+	printk(KERN_INFO "xen_numa_emulation : max_vnodes(%u), max_vcpus(%u)\n",
+		layout->max_vnodes, layout->max_vcpus);
+	dump_numa_layout(layout);
+	memset(&nodes, 0, sizeof(nodes));
+
+	num_vnodes = layout->max_vnodes;
+	BUG_ON(num_vnodes > MAX_NUMNODES);
+
+	max_addr = last_pfn << PAGE_SHIFT;
+
+	node_start_addr = start_pfn << PAGE_SHIFT;
+	for (i = 0; i < num_vnodes; i++)
+	{
+		struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
+		u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
+
+		node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
+
+		if (i == (num_vnodes-1))
+			node_end_addr = max_addr;
+		else
+		{
+			node_end_addr = node_start_addr + node_size;
+			while ((node_end_addr - node_start_addr -
+				e820_hole_size(node_start_addr, node_end_addr)) < node_size)
+			{
+				node_end_addr += FAKE_NODE_MIN_SIZE;
+				if (node_end_addr > max_addr) {
+					node_end_addr = max_addr;
+					break;
+				}
+			}
+		}
+		/* node_start_addr updated inside the function */
+		if (setup_node_range(i, nodes, &node_start_addr,
+				(node_end_addr-node_start_addr), max_addr+1))
+			BUG();
+	}
+
+	printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
+
+	memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
+	if (memnode_shift < 0) {
+		printk(KERN_ERR "No NUMA hash function found.\n");
+		BUG();
+	}
+	/* XXX: Shouldn't be needed because we disabled acpi_numa very early ! */
+	/*
+	 * We need to vacate all active ranges that may have been registered by
+	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
+	 * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
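+	 * (In this series acpi_numa is already forced to -1 in
+	 * xen_arch_setup(), and setup_arch() now calls acpi_numa_init()
+	 * only when acpi_numa > 0, so the BUG_ON below merely asserts
+	 * that state.)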
+ */ + remove_all_active_ranges(); + + BUG_ON(acpi_numa >= 0); + for_each_node_mask(i, node_possible_map) { + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, + nodes[i].end >> PAGE_SHIFT); + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } + xen_init_slit_table(layout); + xen_init_numa_array(layout); + return 0; +} #endif /* CONFIG_NUMA_EMU */ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) { int i; + struct xen_domain_numa_layout *numa_layout = &pv_numa_layout; + + int xen_pv_numa_enabled = numa_layout->max_vnodes; nodes_clear(node_possible_map); nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU + if (xen_pv_domain() && xen_pv_numa_enabled) + { + if (!xen_numa_emulation(numa_layout, start_pfn, last_pfn)) + return; + } + if (cmdline && !numa_emulation(start_pfn, last_pfn)) return; nodes_clear(node_possible_map); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ecb9b0d..b020555 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -83,6 +83,8 @@ void *xen_initial_gdt; */ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; +struct xen_domain_numa_layout *HYPERVISOR_domain_numa_layout; + /* * Flag to determine whether vcpu info placement is available on all * VCPUs. We assume it is to start with, and then set it to zero on @@ -1089,6 +1091,7 @@ static void __init xen_setup_stackprotector(void) pv_cpu_ops.load_gdt = xen_load_gdt; } +extern struct xen_domain_numa_layout pv_numa_layout; /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -1230,6 +1233,12 @@ asmlinkage void __init xen_start_kernel(void) xen_start_info->console.domU.evtchn = 0; } + { + struct xen_domain_numa_layout *layout = + (void *)((char *)xen_start_info + + xen_start_info->numa_layout_info.info_off); + memcpy(&pv_numa_layout, layout, sizeof(*layout)); + } xen_raw_console_write("about to get started...\n"); /* Start the world */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 612f2c9..cb944a2 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -281,6 +281,9 @@ void __init xen_arch_setup(void) printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); disable_acpi(); } + + acpi_numa = -1; + numa_off = 1; #endif /* diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 812ffd5..d4588fa 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -398,6 +398,53 @@ struct shared_info { }; +#define XEN_NR_CPUS 64 +#if defined(__i386__) +#define XEN_BITS_PER_LONG 32 +#define XEN_BYTES_PER_LONG 4 +#define XEN_LONG_BYTEORDER 2 +#elif defined(__x86_64__) +#define XEN_BITS_PER_LONG 64 +#define XEN_BYTES_PER_LONG 8 +#define XEN_LONG_BYTEORDER 3 +#endif + +/* same as cpumask_t - in xen and even Linux (for now) */ +#define XEN_BITS_TO_LONGS(bits) \ + (((bits)+XEN_BITS_PER_LONG-1)/XEN_BITS_PER_LONG) +#define XEN_DECLARE_BITMAP(name,bits) \ + unsigned long name[XEN_BITS_TO_LONGS(bits)] +struct xen_cpumask{ XEN_DECLARE_BITMAP(bits, XEN_NR_CPUS); }; +#ifndef __XEN__ +typedef struct xen_cpumask xen_cpumask_t; +#endif + +#define XEN_MAX_VNODES 8 +struct xen_vnode_data { + uint32_t vnode_id; + uint32_t mnode_id; + uint64_t nr_pages; + /* XXX: Can we use this in xen<->domain interfaces ? */ + struct xen_cpumask vcpu_mask; /* vnode_to_vcpumask */ +}; +#ifndef __XEN__ +typedef struct xen_vnode_data xen_vnode_data_t; +#endif + +/* NUMA layout for the domain at the time of startup. + * Structure has to fit within a page. 
+ */
+struct xen_domain_numa_layout {
+	uint32_t max_vcpus;
+	uint32_t max_vnodes;
+
+	/* Only (max_vnodes*max_vnodes) entries are filled */
+	uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
+	struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
+};
+#ifndef __XEN__
+typedef struct xen_domain_numa_layout xen_domain_numa_layout_t;
+#endif
+
 /*
  * Start-of-day memory layout for the initial domain (DOM0):
  *  1. The domain is started within contiguous virtual-memory region.
@@ -449,6 +496,13 @@ struct start_info {
 	unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
 	unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
 	int8_t cmd_line[MAX_GUEST_CMDLINE];
+	/* The pfn range here covers both page table and p->m table frames. */
+	unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */
+	unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */
+	struct {
+		uint32_t info_off;  /* Offset of numa_layout struct.          */
+		uint32_t info_size; /* Size of numa_layout struct from start. */
+	} numa_layout_info;
 };
 
 struct dom0_vga_console_info {