> It looks as if the comments I provided in
> <20100216174900.GB21067@xxxxxxxxxxxxxxxxxxx> were ignored.
I incorporated a number of your comments, but obviously missed some.
Sorry, wasn't intentional.
Comments inline.
On Mon, Apr 5, 2010 at 10:16 AM, Konrad Rzeszutek Wilk
<konrad.wilk@xxxxxxxxxx> wrote:
>
> It looks as if the comments I provided in
> <20100216174900.GB21067@xxxxxxxxxxxxxxxxxxx> were ignored.
>
> http://article.gmane.org/gmane.comp.emulators.xen.devel/78118
>
> Here I am repeating some of them and adding some new ones.
> Please address/answer them.
>
> .. snip..
>
>> - acpi_numa_init();
>> + if (acpi_numa > 0)
>> + acpi_numa_init();
>
> The style guide is to use tabs, not spaces. It looks out-of-place?
I had the darn "expandtab" option set in my vimrc. I have run the patch
through checkpatch.pl and will fix the indentation issues.
> Why not just do acpi_numa_init() by itself?
>
With PV kernels, we know well ahead of time that NUMA ACPI is irrelevant,
the same as when booting Linux with noacpi. I thought it was unnecessary
to even parse the NUMA ACPI tables in that case.
>
>> #endif
>>
>> initmem_init(0, max_pfn);
>> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
>> index 459913b..3a856dc 100644
>> --- a/arch/x86/mm/numa_64.c
>> +++ b/arch/x86/mm/numa_64.c
>> @@ -11,7 +11,11 @@
>> #include <linux/ctype.h>
>> #include <linux/module.h>
>> #include <linux/nodemask.h>
>> +#include <linux/cpumask.h>
>> #include <linux/sched.h>
>> +#include <linux/bitops.h>
>> +#include <xen/interface/xen.h>
>> +#include <xen/interface/memory.h>
>>
>> #include <asm/e820.h>
>> #include <asm/proto.h>
>> @@ -19,6 +23,8 @@
>> #include <asm/numa.h>
>> #include <asm/acpi.h>
>> #include <asm/k8.h>
>> +#include <asm/xen/hypervisor.h>
>> +#include <asm/xen/hypercall.h>
>>
>> struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
>> EXPORT_SYMBOL(node_data);
>> @@ -428,7 +434,6 @@ static int __init numa_emulation(unsigned long
>> start_pfn, unsigned long last_pfn
>> */
>> if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
>> long n = simple_strtol(cmdline, NULL, 0);
>> -
>
> Uhh, why?
By mistake ! :)
>> num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
>> if (num_nodes < 0)
>> return num_nodes;
>> @@ -522,6 +527,246 @@ out:
>> numa_init_array();
>> return 0;
>> }
>> +
>> +/************************************************************************/
>> +#ifdef CONFIG_XEN_NUMA_GUEST
>
> Yikes. Can you move all of this code in it is own file so that there is
> no need to sprinkle this with #ifdefs?
I agree with you. I will also take care of the few xen header files I
have added to the file.
>
>> +/* XEN PV GUEST NUMA */
>> +struct xen_domain_numa_layout HYPERVISOR_pv_numa_layout;
>> +
>> +static inline void __init
>> +bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp, int nbits)
>> +{
>> + /* We may need to pad the final longword with zeroes. */
>> + if (nbits & (BITS_PER_LONG-1))
>> + lp[BITS_TO_LONGS(nbits)-1] = 0;
>> + memcpy(lp, bp, (nbits+7)/8);
>> +}
>> +
>> +static void __init
>> +xenctl_cpumask_to_cpumask(cpumask_t *cpumask, struct xenctl_cpumask
>> *xcpumask)
>> +{
>> + unsigned int nr_cpus;
>> + uint8_t *bytemap;
>> +
>> + bytemap = xcpumask->bits;
>> +
>> + nr_cpus =
>> + min_t(unsigned int, XENCTL_NR_CPUS, NR_CPUS);
>> +
>> + cpumask_clear(cpumask);
>> + bitmap_byte_to_long(cpus_addr(*cpumask), bytemap, nr_cpus);
>> +}
>> +
>> +static void __init
>> +xen_dump_numa_layout(struct xen_domain_numa_layout *layout)
>> +{
>> + unsigned int i, j;
>> + char vcpumaskstr[128];
>> + printk("NUMA-LAYOUT : vcpus(%u), vnodes(%u), ",
>> + layout->max_vcpus, layout->max_vnodes);
>
> No good. You need printk(KERN_DEBUG ..
Missed these from your previous comments. Will take care of these.
>> + switch (layout->type)
>> + {
>> + case XEN_DOM_NUMA_CONFINED:
>> + printk("type(CONFINED)\n");
> ditto.
>> + break;
>> + case XEN_DOM_NUMA_SPLIT:
>> + printk("type(SPLIT)\n");
> ditto.
>> + break;
>> + case XEN_DOM_NUMA_STRIPED:
>> + printk("type(STRIPED)\n");
> ditto.
>> + break;
>> + default:
>> + printk("type(UNDEFINED)\n");
>
> ditto.
>> + }
>> +
>> + for (i = 0; i < layout->max_vnodes; i++)
>> + {
>> + cpumask_t vcpu_mask;
>> + struct xenctl_cpumask *xvcpumask;
>> + struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
>> + xvcpumask = &vnode_data->vcpu_mask;
>> + xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpumask);
>> + cpumask_scnprintf(vcpumaskstr, sizeof(vcpumaskstr), &vcpu_mask);
>> + printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n",
> ditto
>> + vnode_data->vnode_id, vnode_data->mnode_id,
>> + (unsigned long)vnode_data->nr_pages, vcpumaskstr);
>> + }
>> +
>> + printk("vnode distances :\n");
> ditto
>> + for (i = 0; i < layout->max_vnodes; i++)
>> + printk("\tvnode[%u]", i);
> ditto
>> + for (i = 0; i < layout->max_vnodes; i++)
>> + {
>> + printk("\nvnode[%u]", i);
> ditto
>> + for (j = 0; j < layout->max_vnodes; j++)
>> + printk("\t%u", layout->vnode_distance[i*layout->max_vnodes +
>> j]);
>> + printk("\n");
> ditto
>> + }
>> + return;
>> +}
>> +
>> +static void __init xen_init_slit_table(struct xen_domain_numa_layout
>> *layout)
>> +{
>> + /* Construct a slit table (using layout->vnode_distance).
>> + * Copy it to acpi_slit. */
>> + return;
>> +}
>> +
>> +/* Distribute the vcpus over the vnodes according to their affinity */
>> +static void __init xen_init_numa_array(struct xen_domain_numa_layout
>> *layout)
>> +{
>> + int vcpu, vnode;
>> +
>> + printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");
>> +
>> +
>> + for (vnode = 0; vnode < layout->max_vnodes; vnode++)
>> + {
>> + struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
>> + struct xenctl_cpumask *xvcpu_mask = &vnode_data->vcpu_mask;
>> + cpumask_t vcpu_mask;
>> +
>> + xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpu_mask);
>> +
>> + for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
>> + {
>> + if (cpu_isset(vcpu, vcpu_mask))
>> + {
>> + if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
>> + {
>> + printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n",
>> + vcpu, early_cpu_to_node(vcpu));
>> + continue;
>> + }
>> + printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
>> + numa_set_node(vcpu, vnode);
>> + }
>> + }
>> + }
>> + return;
>> +}
>> +
>> +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
>> + unsigned long start_pfn, unsigned long last_pfn)
>> +{
>> + int num_vnodes, i;
>> + u64 node_start_addr, node_end_addr, max_addr;
>> +
>> + printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
>> + layout->max_vnodes,
>> layout->max_vcpus);
>> +
>> + if (layout->type != XEN_DOM_NUMA_SPLIT)
>> + {
>> + printk(KERN_INFO "xen_numa_emulation : Invalid layout type");
>> + return -1;
>> + }
>> +
>> + memset(&nodes, 0, sizeof(nodes));
>> +
>> + num_vnodes = layout->max_vnodes;
>> + BUG_ON(num_vnodes > MAX_NUMNODES);
>> +
>> + max_addr = last_pfn << PAGE_SHIFT;
>> +
>> + node_start_addr = start_pfn << PAGE_SHIFT;
>> + for (i = 0; i < num_vnodes; i++)
>> + {
>> + struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
>> + u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
>> +
>> + node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
>> +
>> + if (i == (num_vnodes-1))
>> + node_end_addr = max_addr;
>> + else
>> + node_end_addr = node_start_addr + node_size;
>> + /* node_start_addr updated inside the function */
>> + if (setup_node_range(i, nodes, &node_start_addr,
>> + (node_end_addr-node_start_addr), max_addr+1))
>> + goto failed;
>> + }
>> +
>> + printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
>> +
>> + memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
>> + if (memnode_shift < 0) {
>> + printk(KERN_ERR "No NUMA hash function found.\n");
>> + goto failed;
>> + }
>> + /* XXX: Shouldn't be needed because we disabled acpi_numa very early !
>> */
>> + /*
>> + * We need to vacate all active ranges that may have been registered by
>> + * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
>> + * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
>> + */
>> + remove_all_active_ranges();
>> +
>> + BUG_ON(acpi_numa >= 0);
>> + for_each_node_mask(i, node_possible_map) {
>> + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
>> + nodes[i].end >> PAGE_SHIFT);
>> + setup_node_bootmem(i, nodes[i].start, nodes[i].end);
>> + }
>> + xen_init_slit_table(layout);
>> + xen_init_numa_array(layout);
>> + return 0;
>> +failed:
>> + return -1;
>> +}
>> +
>> +static int __init
>> +xen_get_domain_numa_layout(struct xen_domain_numa_layout *pv_layout)
>> +{
>> + int rc;
>> + struct xenmem_numa_op memop;
>> + memop.cmd = XENMEM_get_domain_numa_layout;
>> + memop.u.dinfo.domid = DOMID_SELF;
>> + memop.u.dinfo.version = XEN_DOM_NUMA_INTERFACE_VERSION;
>> + memop.u.dinfo.bufsize = sizeof(*pv_layout);
>> + set_xen_guest_handle(memop.u.dinfo.buf, pv_layout);
>> +
>> + if ((rc = HYPERVISOR_memory_op(XENMEM_numa_op, &memop)))
>> + {
>> + printk(KERN_INFO "XEN NUMA GUEST:xen_get_domain_numa_layout
>> failed\n");
>> + xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
>> + goto done;
>> + }
>> +
>> + if (pv_layout->version != XEN_DOM_NUMA_INTERFACE_VERSION)
>> + {
>> + printk(KERN_INFO "XEN NUMA GUEST: version mismatch (disabling)\n");
>> + xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
>> + rc = -1;
>> + }
>> + xen_dump_numa_layout(pv_layout);
>> +done:
>> + return rc;
>> +}
>> +
>> +static int __init
>> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
>> +{
>> + int rc = 0;
>> + if (!xen_pv_domain() || !(xen_start_info->flags & SIF_NUMA_DOMAIN))
>> + {
>> + rc = -1;
>> + printk(KERN_INFO "xen numa emulation disabled\n");
>> + goto done;
>
> The tabs/spaces don't look right. Can you run this patch through
> checkpatch.pl?
Ok.
>
>> + }
>> + if ((rc = xen_get_domain_numa_layout(&HYPERVISOR_pv_numa_layout)))
>> + goto done;
>> + rc = xen_numa_emulation(&HYPERVISOR_pv_numa_layout, start_pfn,
>> last_pfn);
>> +done:
>> + return rc;
>> +}
>> +#else
>> +static inline int __init
>> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
>> +{
>> + return -1;
>> +}
>> +#endif /* CONFIG_XEN_NUMA_GUEST */
>> +/************************************************************************/
>> #endif /* CONFIG_NUMA_EMU */
>>
>> void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
>> @@ -532,6 +777,9 @@ void __init initmem_init(unsigned long start_pfn,
>> unsigned long last_pfn)
>> nodes_clear(node_online_map);
>>
>> #ifdef CONFIG_NUMA_EMU
>> + if (!xen_pv_numa(start_pfn, last_pfn))
>> + return;
>> +
>> if (cmdline && !numa_emulation(start_pfn, last_pfn))
>> return;
>> nodes_clear(node_possible_map);
>> diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
>> index 7675f9b..7cf6a4f 100644
>> --- a/arch/x86/xen/Kconfig
>> +++ b/arch/x86/xen/Kconfig
>> @@ -73,3 +73,12 @@ config XEN_PCI_PASSTHROUGH
>> help
>> Enable support for passing PCI devices through to
>> unprivileged domains. (COMPLETELY UNTESTED)
>> +
>> +config XEN_NUMA_GUEST
>> + bool "Enable support NUMA aware Xen domains"
>> + depends on XEN && X86_64 && NUMA && NUMA_EMU
>> + help
>> + Enable support for NUMA aware Xen domains. With this
>> + option, if the memory for the domain is allocated
>> + from different memory nodes, the domain is made aware
>> + of this through a Virtual NUMA enlightenment.
>> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
>> index df3e84c..2ee9f0b 100644
>> --- a/arch/x86/xen/setup.c
>> +++ b/arch/x86/xen/setup.c
>> @@ -285,6 +285,8 @@ void __init xen_arch_setup(void)
>> printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
>> disable_acpi();
>> }
>> + acpi_numa = -1;
>> + numa_off = 1;
>> #endif
>>
>> /*
>> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
>> index 32ab005..7f5b85e 100644
>> --- a/include/xen/interface/memory.h
>> +++ b/include/xen/interface/memory.h
>> @@ -240,6 +240,102 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
>> */
>> #define XENMEM_machine_memory_map 10
>>
>> +/* xen guest numa operations */
>> +#define XENMEM_numa_op 15
>> +
>> +#define XEN_DOM_NUMA_INTERFACE_VERSION 0x00000001
>> +#define XENCTL_NR_CPUS 64
>> +#define XENCTL_BITS_PER_BYTE 8
>> +#define XENCTL_BITS_TO_BYTES(bits) \
>> + (((bits)+XENCTL_BITS_PER_BYTE-1)/XENCTL_BITS_PER_BYTE)
>> +
>> +#define XENCTL_DECLARE_BITMAP(name,bits) \
>> + uint8_t name[XENCTL_BITS_TO_BYTES(bits)]
>> +struct xenctl_cpumask{ XENCTL_DECLARE_BITMAP(bits, XENCTL_NR_CPUS); };
>> +
>> +/* Not used in guest */
>> +#define XENMEM_machine_numa_layout 0x01
>> +struct xenmem_node_data {
>> + uint32_t node_id;
>> + uint64_t node_memsize;
>> + uint64_t node_memfree;
>> + struct xenctl_cpumask cpu_mask; /* node_to_cpumask */
>> +};
>> +
>> +/* NUMA layout for the machine.
>> + * Structure has to fit within a page. */
>> +struct xenmem_machine_numa_layout {
>> + uint32_t max_nodes;
>> + /* Only (max_nodes*max_nodes) entries are filled */
>> + GUEST_HANDLE(uint32_t) node_distance;
>> + /* max_vnodes entries of xenmem_node_data type */
>> + GUEST_HANDLE(void) node_data;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_numa_layout);
>> +
>> +
>> +#define XENMEM_machine_nodemap 0x02
>> +struct xenmem_machine_nodemap {
>> + /* On call the size of the available buffer */
>> + uint32_t bufsize;
>> +
>> + /* memnode map parameters */
>> + int32_t shift;
>> + uint32_t mapsize;
>> + GUEST_HANDLE(void) map;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_nodemap);
>> +
>> +/* NUMA layout for the domain at the time of startup.
>> + * Structure has to fit within a page. */
>> +#define XENMEM_set_domain_numa_layout 0x03
>> +#define XENMEM_get_domain_numa_layout 0x04
>> +
>> +/* NUMA layout for the domain at the time of startup.
>> + * Structure has to fit within a page. */
>> +#define XEN_MAX_VNODES 8
>> +
>> +struct xen_vnode_data {
>> + uint32_t vnode_id;
>> + uint32_t mnode_id;
>> + uint64_t nr_pages;
>> + struct xenctl_cpumask vcpu_mask; /* vnode_to_vcpumask */
>> +};
>> +
>> +#define XEN_DOM_NUMA_CONFINED 0x01
>> +#define XEN_DOM_NUMA_SPLIT 0x02
>> +#define XEN_DOM_NUMA_STRIPED 0x03
>> +struct xen_domain_numa_layout {
>> + uint32_t version;
>> + uint32_t type;
>> +
>> + uint32_t max_vcpus;
>> + uint32_t max_vnodes;
>> +
>> + /* Only (max_vnodes*max_vnodes) entries are filled */
>> + uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
>> + /* Only (max_vnodes) entries are filled */
>> + struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
>> +};
>> +
>> +struct xenmem_domain_numa_layout {
>> + domid_t domid;
>> + uint32_t version;
>> +
>> + uint32_t bufsize;
>> + GUEST_HANDLE(void) buf;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_domain_numa_layout);
>> +
>> +struct xenmem_numa_op {
>> + uint32_t cmd;
>> + union {
>> + struct xenmem_machine_numa_layout minfo;
>> + struct xenmem_machine_nodemap mnodemap;
>> + struct xenmem_domain_numa_layout dinfo;
>> + } u;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_numa_op);
>>
>> /*
>> * Prevent the balloon driver from changing the memory reservation
>> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
>> index 9ffaee0..17cb17d 100644
>> --- a/include/xen/interface/xen.h
>> +++ b/include/xen/interface/xen.h
>> @@ -494,6 +494,8 @@ struct dom0_vga_console_info {
>> /* These flags are passed in the 'flags' field of start_info_t. */
>> #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
>> #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
>> +#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */
>> +#define SIF_NUMA_DOMAIN (1<<3) /* Is the domain NUMA aware ? */
>> #define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */
>>
>> typedef uint64_t cpumap_t;
>> @@ -504,6 +506,7 @@ typedef uint8_t xen_domain_handle_t[16];
>> #define __mk_unsigned_long(x) x ## UL
>> #define mk_unsigned_long(x) __mk_unsigned_long(x)
>>
>> +DEFINE_GUEST_HANDLE(uint32_t);
>> DEFINE_GUEST_HANDLE(uint64_t);
>>
>> #else /* __ASSEMBLY__ */
>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@xxxxxxxxxxxxxxxxxxx
>> http://lists.xensource.com/xen-devel
>
>
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|