Re: [Xen-devel] [PATCH 10/11] [PVOPS] Use enlightenment to setup nodes

To: Dulloor <dulloor@xxxxxxxxx>
Subject: Re: [Xen-devel] [PATCH 10/11] [PVOPS] Use enlightenment to setup nodes
From: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
Date: Mon, 5 Apr 2010 10:16:48 -0400
Cc: xen-devel@xxxxxxxxxxxxxxxxxxx
Delivery-date: Mon, 05 Apr 2010 07:17:35 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
In-reply-to: <k2p940bcfd21004041230wdde55c1fie4647bef37da36b8@xxxxxxxxxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
References: <k2p940bcfd21004041230wdde55c1fie4647bef37da36b8@xxxxxxxxxxxxxx>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mutt/1.5.19 (2009-01-05)

It looks as though the comments I provided in
<20100216174900.GB21067@xxxxxxxxxxxxxxxxxxx> were ignored.

http://article.gmane.org/gmane.comp.emulators.xen.devel/78118

I am repeating some of them below and adding some new ones.
Please address or answer each of them.

.. snip..

> -     acpi_numa_init();
> +    if (acpi_numa > 0)
> +         acpi_numa_init();

The style guide is to use tabs, not spaces; the mixed indentation looks out of place.
Why not just call acpi_numa_init() by itself?
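
If the check really is needed, I would rather see it inside
acpi_numa_init() than open-coded at the call site. A minimal sketch of
what I mean (srat_disabled() already exists for exactly this test; the
body here is abbreviated):

int __init acpi_numa_init(void)
{
	if (srat_disabled())	/* acpi_numa < 0: SRAT parsing is off */
		return -1;
	/* ... existing SRAT/SLIT parsing ... */
	return 0;
}

That way every caller can keep the plain acpi_numa_init() call.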


>  #endif
>  
>       initmem_init(0, max_pfn);
> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
> index 459913b..3a856dc 100644
> --- a/arch/x86/mm/numa_64.c
> +++ b/arch/x86/mm/numa_64.c
> @@ -11,7 +11,11 @@
>  #include <linux/ctype.h>
>  #include <linux/module.h>
>  #include <linux/nodemask.h>
> +#include <linux/cpumask.h>
>  #include <linux/sched.h>
> +#include <linux/bitops.h>
> +#include <xen/interface/xen.h>
> +#include <xen/interface/memory.h>
>  
>  #include <asm/e820.h>
>  #include <asm/proto.h>
> @@ -19,6 +23,8 @@
>  #include <asm/numa.h>
>  #include <asm/acpi.h>
>  #include <asm/k8.h>
> +#include <asm/xen/hypervisor.h>
> +#include <asm/xen/hypercall.h>
>  
>  struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
>  EXPORT_SYMBOL(node_data);
> @@ -428,7 +434,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
>        */
>       if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
>               long n = simple_strtol(cmdline, NULL, 0);
> -

Uhh, why?
>               num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
>               if (num_nodes < 0)
>                       return num_nodes;
> @@ -522,6 +527,246 @@ out:
>       numa_init_array();
>       return 0;
>  }
> +
> +/************************************************************************/
> +#ifdef CONFIG_XEN_NUMA_GUEST

Yikes. Can you move all of this code into its own file so that there is
no need to sprinkle numa_64.c with #ifdefs?
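
Something along these lines (just a sketch; the file name and Makefile
hookup are my suggestion, not anything that exists yet) would keep
numa_64.c clean: put the guest NUMA code in, say, arch/x86/xen/numa.c,
build it with

	obj-$(CONFIG_XEN_NUMA_GUEST) += numa.o

in arch/x86/xen/Makefile, and export a single entry point from a header:

#ifdef CONFIG_XEN_NUMA_GUEST
int __init xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn);
#else
static inline int __init
xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
{
	return -1;	/* built without Xen guest NUMA support */
}
#endif

Then numa_64.c needs nothing beyond the one call you already have in
initmem_init().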

> +/* XEN PV GUEST NUMA */
> +struct xen_domain_numa_layout HYPERVISOR_pv_numa_layout;
> +
> +static inline void __init
> +bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp, int nbits)
> +{
> +     /* We may need to pad the final longword with zeroes. */
> +     if (nbits & (BITS_PER_LONG-1))
> +             lp[BITS_TO_LONGS(nbits)-1] = 0;
> +     memcpy(lp, bp, (nbits+7)/8);
> +}
> +
> +static void __init
> +xenctl_cpumask_to_cpumask(cpumask_t *cpumask, struct xenctl_cpumask *xcpumask)
> +{
> +    unsigned int nr_cpus;
> +    uint8_t *bytemap;
> +
> +    bytemap = xcpumask->bits;
> +
> +    nr_cpus =
> +        min_t(unsigned int, XENCTL_NR_CPUS, NR_CPUS);
> +
> +    cpumask_clear(cpumask);
> +    bitmap_byte_to_long(cpus_addr(*cpumask), bytemap, nr_cpus);
> +}
> +
> +static void __init
> +xen_dump_numa_layout(struct xen_domain_numa_layout *layout)
> +{
> +    unsigned int i, j;
> +    char vcpumaskstr[128];
> +    printk("NUMA-LAYOUT : vcpus(%u), vnodes(%u), ",
> +                            layout->max_vcpus, layout->max_vnodes);

No good. You need printk(KERN_DEBUG ..
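
i.e. something like this (sketch):

	printk(KERN_DEBUG "NUMA-LAYOUT : vcpus(%u), vnodes(%u), ",
	       layout->max_vcpus, layout->max_vnodes);

and the same for every other bare printk() in this function.
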
> +    switch (layout->type)
> +    {
> +        case XEN_DOM_NUMA_CONFINED:
> +            printk("type(CONFINED)\n");
ditto.
> +            break;
> +        case XEN_DOM_NUMA_SPLIT:
> +            printk("type(SPLIT)\n");
ditto.
> +            break;
> +        case XEN_DOM_NUMA_STRIPED:
> +            printk("type(STRIPED)\n");
ditto.
> +            break;
> +        default:
> +            printk("type(UNDEFINED)\n");

ditto.
> +    }
> +
> +    for (i = 0; i < layout->max_vnodes; i++)
> +    {
> +        cpumask_t vcpu_mask;
> +        struct xenctl_cpumask *xvcpumask;
> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
> +        xvcpumask = &vnode_data->vcpu_mask;
> +        xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpumask);
> +        cpumask_scnprintf(vcpumaskstr, sizeof(vcpumaskstr), &vcpu_mask);
> +        printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n", 
ditto
> +                vnode_data->vnode_id, vnode_data->mnode_id,
> +                (unsigned long)vnode_data->nr_pages, vcpumaskstr);
> +    }
> +
> +    printk("vnode distances :\n");
ditto
> +    for (i = 0; i < layout->max_vnodes; i++)
> +        printk("\tvnode[%u]", i);
ditto
> +    for (i = 0; i < layout->max_vnodes; i++)
> +    {
> +        printk("\nvnode[%u]", i);
ditto
> +        for (j = 0; j < layout->max_vnodes; j++)
> +            printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
> +        printk("\n");
ditto
> +    }
> +    return;
> +}
> +
> +static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
> +{
> +    /* Construct a slit table (using layout->vnode_distance).
> +     * Copy it to acpi_slit. */
> +    return;
> +}
> +
> +/* Distribute the vcpus over the vnodes according to their affinity */
> +static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
> +{
> +     int vcpu, vnode;
> +   
> +     printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");
> +
> +
> +    for (vnode = 0; vnode < layout->max_vnodes; vnode++)
> +    {
> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
> +        struct xenctl_cpumask *xvcpu_mask = &vnode_data->vcpu_mask;
> +        cpumask_t vcpu_mask;
> +
> +        xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpu_mask);
> +   
> +        for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
> +        {
> +            if (cpu_isset(vcpu, vcpu_mask))
> +            {
> +                if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
> +                {
> +                    printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n", 
> +                                        vcpu, early_cpu_to_node(vcpu)); 
> +                    continue;
> +                }
> +                printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
> +                     numa_set_node(vcpu, vnode);
> +            }
> +        }
> +    }
> +    return;
> +}
> +
> +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
> +                            unsigned long start_pfn, unsigned long last_pfn)
> +{
> +     int num_vnodes, i;
> +    u64 node_start_addr, node_end_addr, max_addr;
> +
> +    printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
> +                                        layout->max_vnodes, 
> layout->max_vcpus);
> +
> +    if (layout->type != XEN_DOM_NUMA_SPLIT)
> +    {
> +        printk(KERN_INFO "xen_numa_emulation : Invalid layout type");
> +        return -1;
> +    }
> +
> +     memset(&nodes, 0, sizeof(nodes));
> +
> +    num_vnodes = layout->max_vnodes;
> +    BUG_ON(num_vnodes > MAX_NUMNODES);
> +
> +    max_addr = last_pfn << PAGE_SHIFT;
> +
> +    node_start_addr = start_pfn << PAGE_SHIFT;
> +    for (i = 0; i < num_vnodes; i++)
> +    {
> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
> +        u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
> +
> +             node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
> +
> +             if (i == (num_vnodes-1))
> +                     node_end_addr = max_addr;
> +             else
> +            node_end_addr = node_start_addr + node_size;
> +        /* node_start_addr updated inside the function */
> +        if (setup_node_range(i, nodes, &node_start_addr, 
> +                    (node_end_addr-node_start_addr), max_addr+1))
> +            goto failed;
> +    }
> +
> +     printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
> +
> +    memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
> +    if (memnode_shift < 0) {
> +         printk(KERN_ERR "No NUMA hash function found.\n");
> +        goto failed;
> +    }
> +    /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! */
> +     /*
> +      * We need to vacate all active ranges that may have been registered by
> +      * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
> +      * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
> +      */
> +     remove_all_active_ranges();
> +
> +    BUG_ON(acpi_numa >= 0);
> +     for_each_node_mask(i, node_possible_map) {
> +             e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
> +                                             nodes[i].end >> PAGE_SHIFT);
> +             setup_node_bootmem(i, nodes[i].start, nodes[i].end);
> +     }
> +    xen_init_slit_table(layout);
> +     xen_init_numa_array(layout);
> +     return 0;
> +failed:
> +    return -1;
> +}
> +
> +static int __init
> +xen_get_domain_numa_layout(struct xen_domain_numa_layout *pv_layout)
> +{
> +    int rc;
> +    struct xenmem_numa_op memop;
> +    memop.cmd = XENMEM_get_domain_numa_layout;
> +    memop.u.dinfo.domid = DOMID_SELF;
> +    memop.u.dinfo.version = XEN_DOM_NUMA_INTERFACE_VERSION;
> +    memop.u.dinfo.bufsize = sizeof(*pv_layout);
> +    set_xen_guest_handle(memop.u.dinfo.buf, pv_layout);
> +
> +    if ((rc = HYPERVISOR_memory_op(XENMEM_numa_op, &memop)))
> +    {
> +        printk(KERN_INFO "XEN NUMA GUEST:xen_get_domain_numa_layout 
> failed\n");
> +        xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
> +        goto done;
> +    }
> +
> +    if (pv_layout->version != XEN_DOM_NUMA_INTERFACE_VERSION)
> +    {
> +        printk(KERN_INFO "XEN NUMA GUEST: version mismatch (disabling)\n");
> +        xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
> +        rc = -1;
> +    }
> +    xen_dump_numa_layout(pv_layout);
> +done:
> +    return rc;
> +}
> +
> +static int __init
> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
> +{
> +    int rc = 0;
> +    if (!xen_pv_domain() || !(xen_start_info->flags & SIF_NUMA_DOMAIN))
> +    {
> +        rc = -1;
> +         printk(KERN_INFO "xen numa emulation disabled\n");
> +        goto done;

The tabs/spaces don't look right. Can you run this patch through
checkpatch.pl?
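
With tabs and the brace placement from Documentation/CodingStyle this
block should look like (sketch):

	if (!xen_pv_domain() || !(xen_start_info->flags & SIF_NUMA_DOMAIN)) {
		printk(KERN_INFO "xen numa emulation disabled\n");
		rc = -1;
		goto done;
	}

checkpatch.pl will flag the rest of the mixed-indentation lines for you.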

> +    }
> +    if ((rc = xen_get_domain_numa_layout(&HYPERVISOR_pv_numa_layout)))
> +        goto done;
> +    rc = xen_numa_emulation(&HYPERVISOR_pv_numa_layout, start_pfn, last_pfn);
> +done:
> +    return rc;
> +}
> +#else
> +static inline int __init
> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
> +{
> +    return -1;
> +}
> +#endif /* CONFIG_XEN_NUMA_GUEST */
> +/************************************************************************/
>  #endif /* CONFIG_NUMA_EMU */
>  
>  void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
> @@ -532,6 +777,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
>       nodes_clear(node_online_map);
>  
>  #ifdef CONFIG_NUMA_EMU
> +    if (!xen_pv_numa(start_pfn, last_pfn))
> +        return;
> +
>       if (cmdline && !numa_emulation(start_pfn, last_pfn))
>               return;
>       nodes_clear(node_possible_map);
> diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
> index 7675f9b..7cf6a4f 100644
> --- a/arch/x86/xen/Kconfig
> +++ b/arch/x86/xen/Kconfig
> @@ -73,3 +73,12 @@ config XEN_PCI_PASSTHROUGH
>         help
>           Enable support for passing PCI devices through to
>        unprivileged domains. (COMPLETELY UNTESTED)
> +
> +config XEN_NUMA_GUEST
> +    bool "Enable support NUMA aware Xen domains"
> +    depends on XEN && X86_64 && NUMA && NUMA_EMU
> +    help
> +        Enable support for NUMA aware Xen domains. With this
> +        option, if the memory for the domain is allocated 
> +        from different memory nodes, the domain is made aware
> +        of this through a Virtual NUMA enlightenment.
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index df3e84c..2ee9f0b 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -285,6 +285,8 @@ void __init xen_arch_setup(void)
>               printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
>               disable_acpi();
>       }
> +    acpi_numa = -1;
> +    numa_off = 1;
>  #endif
>  
>       /* 
> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
> index 32ab005..7f5b85e 100644
> --- a/include/xen/interface/memory.h
> +++ b/include/xen/interface/memory.h
> @@ -240,6 +240,102 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
>   */
>  #define XENMEM_machine_memory_map   10
>  
> +/* xen guest numa operations */
> +#define XENMEM_numa_op           15
> +
> +#define XEN_DOM_NUMA_INTERFACE_VERSION  0x00000001
> +#define XENCTL_NR_CPUS 64
> +#define XENCTL_BITS_PER_BYTE 8
> +#define XENCTL_BITS_TO_BYTES(bits) \
> +    (((bits)+XENCTL_BITS_PER_BYTE-1)/XENCTL_BITS_PER_BYTE)
> +
> +#define XENCTL_DECLARE_BITMAP(name,bits) \
> +    uint8_t name[XENCTL_BITS_TO_BYTES(bits)]
> +struct xenctl_cpumask{ XENCTL_DECLARE_BITMAP(bits, XENCTL_NR_CPUS); };
> +
> +/* Not used in guest */
> +#define XENMEM_machine_numa_layout 0x01
> +struct xenmem_node_data {
> +    uint32_t node_id;
> +    uint64_t node_memsize;
> +    uint64_t node_memfree;
> +    struct xenctl_cpumask cpu_mask; /* node_to_cpumask */
> +};
> +
> +/* NUMA layout for the machine.
> + * Structure has to fit within a page. */
> +struct xenmem_machine_numa_layout {
> +    uint32_t max_nodes;
> +    /* Only (max_nodes*max_nodes) entries are filled */
> +    GUEST_HANDLE(uint32_t) node_distance;
> +    /* max_vnodes entries of xenmem_node_data type */
> +    GUEST_HANDLE(void) node_data;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_numa_layout);
> +
> +
> +#define XENMEM_machine_nodemap  0x02
> +struct xenmem_machine_nodemap {
> +    /* On call the size of the available buffer */
> +    uint32_t bufsize;
> +
> +    /* memnode map parameters */
> +    int32_t shift;
> +    uint32_t mapsize;
> +    GUEST_HANDLE(void) map;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_nodemap);
> +
> +/* NUMA layout for the domain at the time of startup.
> + * Structure has to fit within a page. */
> +#define XENMEM_set_domain_numa_layout 0x03
> +#define XENMEM_get_domain_numa_layout 0x04
> +
> +/* NUMA layout for the domain at the time of startup.
> + * Structure has to fit within a page. */
> +#define XEN_MAX_VNODES 8
> +
> +struct xen_vnode_data {
> +    uint32_t vnode_id;
> +    uint32_t mnode_id;
> +    uint64_t nr_pages;
> +    struct xenctl_cpumask vcpu_mask; /* vnode_to_vcpumask */
> +};
> +
> +#define XEN_DOM_NUMA_CONFINED       0x01
> +#define XEN_DOM_NUMA_SPLIT          0x02
> +#define XEN_DOM_NUMA_STRIPED         0x03
> +struct xen_domain_numa_layout {
> +    uint32_t version;
> +    uint32_t type;
> +
> +    uint32_t max_vcpus;
> +    uint32_t max_vnodes;
> +
> +    /* Only (max_vnodes*max_vnodes) entries are filled */
> +    uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
> +    /* Only (max_vnodes) entries are filled */
> +    struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
> +};
> +
> +struct xenmem_domain_numa_layout {
> +    domid_t domid;
> +    uint32_t version;
> +
> +    uint32_t bufsize;
> +    GUEST_HANDLE(void) buf;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_domain_numa_layout);
> +
> +struct xenmem_numa_op {
> +    uint32_t cmd;
> +    union {
> +        struct xenmem_machine_numa_layout minfo;
> +        struct xenmem_machine_nodemap mnodemap;
> +        struct xenmem_domain_numa_layout dinfo;
> +    } u;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_numa_op);
>  
>  /*
>   * Prevent the balloon driver from changing the memory reservation
> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
> index 9ffaee0..17cb17d 100644
> --- a/include/xen/interface/xen.h
> +++ b/include/xen/interface/xen.h
> @@ -494,6 +494,8 @@ struct dom0_vga_console_info {
>  /* These flags are passed in the 'flags' field of start_info_t. */
>  #define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
>  #define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
> +#define SIF_MULTIBOOT_MOD (1<<2)  /* Is mod_start a multiboot module? */
> +#define SIF_NUMA_DOMAIN   (1<<3)  /* Is the domain NUMA aware ? */
>  #define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
>  
>  typedef uint64_t cpumap_t;
> @@ -504,6 +506,7 @@ typedef uint8_t xen_domain_handle_t[16];
>  #define __mk_unsigned_long(x) x ## UL
>  #define mk_unsigned_long(x) __mk_unsigned_long(x)
>  
> +DEFINE_GUEST_HANDLE(uint32_t);
>  DEFINE_GUEST_HANDLE(uint64_t);
>  
>  #else /* __ASSEMBLY__ */


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
