WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

Re: [xen-devel][vNUMA v2][PATCH 7/8] Construct SRAT/SLIT for NUMA HVM

Dulloor wrote:
Construct SRAT/SLIT tables for HVM NUMA.


--- a/tools/firmware/hvmloader/acpi/acpi2_0.h
+++ b/tools/firmware/hvmloader/acpi/acpi2_0.h

+struct acpi_30_srat_mem_affinity {
+    uint8_t type;
+    uint8_t length;
+    uint32_t proximity_domain;
+    uint16_t reserved;         /* Reserved, must be zero */
+    uint64_t base_address;
+    uint64_t size;
As the ACPI spec talks about 32-bit members only, I'd rather
see this reflected here. In experiments I saw strange bugs due to the fact that hvmloader is a self contained 32bit binary, which lacks some runtime 64bit functionality (like 64 by 32 bit division). Beside that it would make the whole code endianess aware, though this is possibly of limited use for nowaday's Xen ;-)

+    uint32_t reserved1;
+    uint32_t flags;
+    uint64_t reserved2;            /* Reserved, must be zero */
+};

--- a/tools/firmware/hvmloader/acpi/build.c
+++ b/tools/firmware/hvmloader/acpi/build.c
...
+static int
+construct_srat_mem_affinity(struct acpi_30_srat_mem_affinity *mem_srat)
+{
+    int vnode;
+    struct acpi_30_srat_mem_affinity *mem_srat_iter = mem_srat;
+    struct xen_domain_numa_info *numa_info = &hvm_info->numa_info[0];
+    struct xen_vnode_info *numa_vnode_info = NUMA_INFO_VNODE_INFO(numa_info);
+
+    for ( vnode = 0; vnode < numa_info->nr_vnodes; vnode++ )
+    {
+        struct xen_vnode_info *vnode_info = &numa_vnode_info[vnode];
+        memset(mem_srat_iter, 0, sizeof(*mem_srat_iter));
+        mem_srat_iter->type = ACPI_30_SRAT_TYPE_MEMORY_AFFINITY;
+        mem_srat_iter->length = sizeof(*mem_srat_iter);
+        mem_srat_iter->proximity_domain = vnode;
+        mem_srat_iter->base_address = (uint64_t)vnode_info->start << PAGE_SHIFT;
+        mem_srat_iter->size =
+            (uint64_t)(vnode_info->end - vnode_info->start) << PAGE_SHIFT;
+        mem_srat_iter->flags = ACPI_30_SRAT_MEM_ENABLED;
+        mem_srat_iter++;
+    }
+    /* return length of the sub-table */
+    return ((uint8_t *)mem_srat_iter-(uint8_t *)mem_srat);
+}
This approach will lead to possible problems. Although you do account
for the holes in libxc, the SRATs lacks them, leading to one node
possibly having a larger amount of memory than there actually is (increased by the size of the PCI hole). Linux seems to cope with this, but I'd rather see the PCI memory left out of SRAT. Actually this memory mapped region does not need to belong to that special node, but it should be attributed to the processor containing the I/O link. I don't think that we should model this in Xen, though, it would probably be over-engineered and wouldn't apply to the guest anyway (in case of PV I/O).
I have already ported my older code over to this, it works better and
matches SRAT tables I have seen on real machines (although AMD only).

--- a/tools/libxc/xc_hvm_build.c
+++ b/tools/libxc/xc_hvm_build.c
@@ -11,6 +11,7 @@
 #include "xg_private.h"
 #include "xc_private.h"
 #include "xc_dom_numa.h"
+#include "xc_cpumap.h"
#include <xen/foreign/x86_32.h>
 #include <xen/foreign/x86_64.h>
@@ -32,7 +33,62 @@
 #define NR_SPECIAL_PAGES     4
 #define special_pfn(x) (0xff000u - NR_SPECIAL_PAGES + (x))
-static void build_hvm_info(void *hvm_info_page, uint64_t mem_size)
+static int build_hvm_numa_info(struct hvm_info_table *hvm_info,
+                               xc_domain_numa_layout_t *dlayout)
+{
+    int i, j;
+    uint64_t vnode_pgstart;
+    struct xen_domain_numa_info *ninfo;
+    struct xen_vnode_info *ninfo_vnode_info;
+    uint8_t *ninfo_vcpu_to_vnode, *ninfo_vnode_distance;
+
+    ninfo = &hvm_info->numa_info[0];
+    ninfo->version = dlayout->version;
+    ninfo->type = dlayout->type;
+    ninfo->nr_vcpus = dlayout->nr_vcpus;
+    ninfo->nr_vnodes = dlayout->nr_vnodes;
+
+    ninfo_vnode_info = NUMA_INFO_VNODE_INFO(ninfo);
+    ninfo_vcpu_to_vnode = NUMA_INFO_VCPU_TO_VNODE(ninfo);
+    ninfo_vnode_distance = NUMA_INFO_VNODE_DISTANCE(ninfo);
+
+       for (i=0; i<ninfo->nr_vcpus; i++)
+               ninfo_vcpu_to_vnode[i] = XEN_INVALID_NODE;
+
+    for (i=0, vnode_pgstart=0; i<dlayout->nr_vnodes; i++)
+    {
+        uint64_t vnode_pgend;
+               struct xenctl_cpumap vnode_vcpumap;
+        xc_vnode_data_t *vnode_data = &dlayout->vnode_data[i];
+               xc_cpumask_t *vnode_vcpumask = &vnode_data->vcpu_mask;
+        struct xen_vnode_info *vnode_info = &ninfo_vnode_info[i];
+
+        vnode_info->mnode_id = vnode_data->mnode_id;
+        vnode_pgend = vnode_pgstart + vnode_data->nr_pages;
+        /* Account for hole in the memory map */
+        if ( (vnode_pgstart < hvm_info->low_mem_pgend) &&
+             (vnode_pgend >= hvm_info->low_mem_pgend) )
I think this is wrong. On guests with less than 4 GB of memory this leads to the last node containing more memory, although it does not touch the PCI hole. It should look like this:
+        if ( (vnode_pgstart < HVM_BELOW_4G_RAM_END) &&
+                            (vnode_pgend >= HVM_BELOW_4G_RAM_END) )
+                vnode_pgend += ((1ull<<32) - HVM_BELOW_4G_RAM_END)>>PAGE_SHIFT;

Regards,
Andre.

--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 448-3567-12


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

<Prev in Thread] Current Thread [Next in Thread>