WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

Re: [xen-devel][vNUMA v2][PATCH 7/8] Construct SRAT/SLIT for NUMA HVM

Dulloor wrote:
Construct SRAT/SLIT tables for HVM NUMA.


--- a/tools/firmware/hvmloader/acpi/acpi2_0.h
+++ b/tools/firmware/hvmloader/acpi/acpi2_0.h

+struct acpi_30_srat_mem_affinity {
+    uint8_t type;
+    uint8_t length;
+    uint32_t proximity_domain;
+    uint16_t reserved;         /* Reserved, must be zero */
+    uint64_t base_address;
+    uint64_t size;
As the ACPI spec talks about 32-bit members only, I'd rather
see this reflected here. In experiments I saw strange bugs due to the fact that hvmloader is a self contained 32bit binary, which lacks some runtime 64bit functionality (like 64 by 32 bit division). Beside that it would make the whole code endianess aware, though this is possibly of limited use for nowaday's Xen ;-)

+    uint32_t reserved1;
+    uint32_t flags;
+    uint64_t reserved2;            /* Reserved, must be zero */
+};

--- a/tools/firmware/hvmloader/acpi/build.c
+++ b/tools/firmware/hvmloader/acpi/build.c
...
+static int
+construct_srat_mem_affinity(struct acpi_30_srat_mem_affinity *mem_srat)
+{
+    int vnode;
+    struct acpi_30_srat_mem_affinity *mem_srat_iter = mem_srat;
+    struct xen_domain_numa_info *numa_info = &hvm_info->numa_info[0];
+    struct xen_vnode_info *numa_vnode_info = NUMA_INFO_VNODE_INFO(numa_info);
+
+    for ( vnode = 0; vnode < numa_info->nr_vnodes; vnode++ )
+    {
+        struct xen_vnode_info *vnode_info = &numa_vnode_info[vnode];
+        memset(mem_srat_iter, 0, sizeof(*mem_srat_iter));
+        mem_srat_iter->type = ACPI_30_SRAT_TYPE_MEMORY_AFFINITY;
+        mem_srat_iter->length = sizeof(*mem_srat_iter);
+        mem_srat_iter->proximity_domain = vnode;
+        mem_srat_iter->base_address = (uint64_t)vnode_info->start << PAGE_SHIFT;
+        mem_srat_iter->size =
+            (uint64_t)(vnode_info->end - vnode_info->start) << PAGE_SHIFT;
+        mem_srat_iter->flags = ACPI_30_SRAT_MEM_ENABLED;
+        mem_srat_iter++;
+    }
+    /* return length of the sub-table */
+    return ((uint8_t *)mem_srat_iter-(uint8_t *)mem_srat);
+}
This approach will lead to possible problems. Although you do account
for the holes in libxc, the SRATs lacks them, leading to one node
possibly having a larger amount of memory than there actually is (increased by the size of the PCI hole). Linux seems to cope with this, but I'd rather see the PCI memory left out of SRAT. Actually this memory mapped region does not need to belong to that special node, but it should be attributed to the processor containing the I/O link. I don't think that we should model this in Xen, though, it would probably be over-engineered and wouldn't apply to the guest anyway (in case of PV I/O).
I have already ported my older code over to this, it works better and
matches SRAT tables I have seen on real machines (although AMD only).

--- a/tools/libxc/xc_hvm_build.c
+++ b/tools/libxc/xc_hvm_build.c
@@ -11,6 +11,7 @@
 #include "xg_private.h"
 #include "xc_private.h"
 #include "xc_dom_numa.h"
+#include "xc_cpumap.h"
#include <xen/foreign/x86_32.h>
 #include <xen/foreign/x86_64.h>
@@ -32,7 +33,62 @@
 #define NR_SPECIAL_PAGES     4
 #define special_pfn(x) (0xff000u - NR_SPECIAL_PAGES + (x))
-static void build_hvm_info(void *hvm_info_page, uint64_t mem_size)
+static int build_hvm_numa_info(struct hvm_info_table *hvm_info,
+                               xc_domain_numa_layout_t *dlayout)
+{
+    int i, j;
+    uint64_t vnode_pgstart;
+    struct xen_domain_numa_info *ninfo;
+    struct xen_vnode_info *ninfo_vnode_info;
+    uint8_t *ninfo_vcpu_to_vnode, *ninfo_vnode_distance;
+
+    ninfo = &hvm_info->numa_info[0];
+    ninfo->version = dlayout->version;
+    ninfo->type = dlayout->type;
+    ninfo->nr_vcpus = dlayout->nr_vcpus;
+    ninfo->nr_vnodes = dlayout->nr_vnodes;
+
+    ninfo_vnode_info = NUMA_INFO_VNODE_INFO(ninfo);
+    ninfo_vcpu_to_vnode = NUMA_INFO_VCPU_TO_VNODE(ninfo);
+    ninfo_vnode_distance = NUMA_INFO_VNODE_DISTANCE(ninfo);
+
+       for (i=0; i<ninfo->nr_vcpus; i++)
+               ninfo_vcpu_to_vnode[i] = XEN_INVALID_NODE;
+
+    for (i=0, vnode_pgstart=0; i<dlayout->nr_vnodes; i++)
+    {
+        uint64_t vnode_pgend;
+               struct xenctl_cpumap vnode_vcpumap;
+        xc_vnode_data_t *vnode_data = &dlayout->vnode_data[i];
+               xc_cpumask_t *vnode_vcpumask = &vnode_data->vcpu_mask;
+        struct xen_vnode_info *vnode_info = &ninfo_vnode_info[i];
+
+        vnode_info->mnode_id = vnode_data->mnode_id;
+        vnode_pgend = vnode_pgstart + vnode_data->nr_pages;
+        /* Account for hole in the memory map */
+        if ( (vnode_pgstart < hvm_info->low_mem_pgend) &&
+             (vnode_pgend >= hvm_info->low_mem_pgend) )
I think this is wrong. On guests with less than 4 GB of memory this leads to the last node containing more memory, although it does not touch the PCI hole. It should look like this:
+        if ( (vnode_pgstart < HVM_BELOW_4G_RAM_END) &&
+                            (vnode_pgend >= HVM_BELOW_4G_RAM_END) )
+                vnode_pgend += ((1ull<<32) - HVM_BELOW_4G_RAM_END)>>PAGE_SHIFT;

Regards,
Andre.

--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 448-3567-12


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

<Prev in Thread] Current Thread [Next in Thread>