WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH 4/4] hvm: NUMA guest: inject NUMA topology into the guest

To: xen-devel@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-devel] [PATCH 4/4] hvm: NUMA guest: inject NUMA topology into the guest
From: Andre Przywara <andre.przywara@xxxxxxx>
Date: Fri, 4 Jul 2008 10:02:49 +0200
Delivery-date: Fri, 04 Jul 2008 01:03:36 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Thunderbird 1.5.0.10 (X11/20070409)
This patch extends the hvm_info_table to store the number of guest NUMA
nodes and creates a suitable ACPI SRAT table that describes the guest's
NUMA topology.

Signed-off-by: Andre Przywara <andre.przywara@xxxxxxx>

Regards,
Andre.

--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy
# HG changeset patch
# User Andre Przywara <andre.przywara@xxxxxxx>
# Date 1215084035 -7200
# Node ID aa69281c1ecf288c729a9fb5aaab1fa0983072bb
# Parent  b84c5f2fe83bd7c94ed956ba412689e614177f5c
advertise NUMA topology to the guest (via an ACPI table)

diff -r b84c5f2fe83b -r aa69281c1ecf tools/firmware/hvmloader/acpi/acpi2_0.h
--- a/tools/firmware/hvmloader/acpi/acpi2_0.h   Thu Jul 03 13:17:11 2008 +0200
+++ b/tools/firmware/hvmloader/acpi/acpi2_0.h   Thu Jul 03 13:20:35 2008 +0200
@@ -356,6 +356,61 @@
 };
 
 /*
+ * System Resource Affinity Table header definition (SRAT).
+ */
+struct acpi_20_srat {                  /* fixed SRAT header; construct_srat() appends the affinity entries right after it */
+    struct acpi_header header;         /* common ACPI table header (signature, length, checksum, OEM/creator ids) */
+    uint32_t table_revision;           /* set to ACPI_SRAT_TABLE_REVISION by construct_srat() */
+    uint32_t reserved2[2];             /* left zero: construct_srat() memsets the whole header first */
+};
+
+#define ACPI_SRAT_TABLE_REVISION 1
+
+/*
+ * System Resource Affinity Table structure types.
+ */
+#define ACPI_PROCESSOR_AFFIN           0x00
+#define ACPI_MEMORY_AFFIN              0x01
+
+struct acpi_20_srat_processor {        /* one SRAT entry tying a local APIC to a NUMA node */
+    uint8_t type;                      /* ACPI_PROCESSOR_AFFIN for this entry kind */
+    uint8_t length;                    /* size of this entry in bytes (sizeof the struct) */
+    uint8_t domain;                    /* node number the CPU belongs to (low byte) */
+    uint8_t apic_id;                   /* local APIC id; construct_srat() uses LAPIC_ID(vcpu) */
+    uint32_t flags;                    /* ACPI_LOCAL_APIC_AFFIN_* bits */
+    uint8_t sapic_id;                  /* written as 0 by construct_srat() */
+    uint8_t domain_hi[3];              /* presumably the upper domain bytes; left zero here - confirm against the ACPI spec */
+    uint32_t reserved;                 /* left zero */
+};
+
+/*
+ * Local APIC Affinity Flags.  All other bits are reserved and must be 0.
+ */
+#define ACPI_LOCAL_APIC_AFFIN_ENABLED (1 << 0)
+
+struct acpi_20_srat_memory {           /* one SRAT entry tying a physical memory range to a NUMA node */
+    uint8_t type;                      /* ACPI_MEMORY_AFFIN for this entry kind */
+    uint8_t length;                    /* size of this entry in bytes (sizeof the struct) */
+    uint8_t domain;                    /* node number the range belongs to (low byte) */
+    uint8_t domain_hi[3];      /* this is ACPI 3.0, reserved in 2.0 */
+    uint16_t reserved;                 /* left zero */
+    uint32_t base_address_lo;          /* low 32 bits of the range start address */
+    uint32_t base_address_hi;          /* high 32 bits of the range start address */
+    uint32_t length_lo;                /* low 32 bits of the range size in bytes */
+    uint32_t length_hi;                /* high 32 bits of the range size in bytes */
+    uint32_t reserved2;                /* left zero */
+    uint32_t flags;                    /* ACPI_MEM_AFFIN_* bits */
+    uint32_t reserved3[2];             /* left zero */
+};
+
+/*
+ * Memory Affinity Flags.  All other bits are reserved and must be 0.
+ */
+#define ACPI_MEM_AFFIN_ENABLED (1 << 0)
+#define ACPI_MEM_AFFIN_HOTPLUGGABLE (1 << 1)
+#define ACPI_MEM_AFFIN_NONVOLATILE (1 << 2)  /* this is ACPI 3.0 */
+
+/*
  * Table Signatures.
  */
 #define ACPI_2_0_RSDP_SIGNATURE ASCII64('R','S','D',' ','P','T','R',' ')
@@ -366,6 +421,7 @@
 #define ACPI_2_0_XSDT_SIGNATURE ASCII32('X','S','D','T')
 #define ACPI_2_0_TCPA_SIGNATURE ASCII32('T','C','P','A')
 #define ACPI_2_0_HPET_SIGNATURE ASCII32('H','P','E','T')
+#define ACPI_2_0_SRAT_SIGNATURE ASCII32('S','R','A','T')
 
 /*
  * Table revision numbers.
@@ -378,6 +434,7 @@
 #define ACPI_2_0_TCPA_REVISION 0x02
 #define ACPI_2_0_HPET_REVISION 0x01
 #define ACPI_1_0_FADT_REVISION 0x01
+#define ACPI_2_0_SRAT_REVISION 0x01
 
 #pragma pack ()
 
diff -r b84c5f2fe83b -r aa69281c1ecf tools/firmware/hvmloader/acpi/build.c
--- a/tools/firmware/hvmloader/acpi/build.c     Thu Jul 03 13:17:11 2008 +0200
+++ b/tools/firmware/hvmloader/acpi/build.c     Thu Jul 03 13:20:35 2008 +0200
@@ -20,6 +20,9 @@
 #include "ssdt_tpm.h"
 #include "../config.h"
 #include "../util.h"
+#include "../e820.h"
+
+#define ONEMB 0x100000
 
 #define align16(sz)        (((sz) + 15) & ~15)
 #define fixed_strcpy(d, s) strncpy((d), (s), sizeof(d))
@@ -45,6 +48,140 @@
 
     p = table;
     p[checksum_offset] = -sum;
+}
+
+static int vcpu_to_numa_node (int vcpu_id, int nr_vcpus)   /* map a VCPU index to a node index; VCPUs are split into contiguous runs over get_numanodes() nodes */
+{
+int div,mod;       /* div: VCPUs per node rounded down; mod: number of nodes that receive one extra VCPU */
+
+    div=nr_vcpus / get_numanodes();    /* NOTE(review): no zero check here; the visible caller path only runs when get_numanodes() > 0 */
+    mod=nr_vcpus % get_numanodes();
+
+    if ( vcpu_id < mod * (div + 1)) return vcpu_id / (div + 1);    /* the first mod nodes hold div+1 VCPUs each */
+    return ( ( vcpu_id - (mod * (div + 1)) ) / div ) + mod;        /* the remaining nodes hold div VCPUs each; only reached with div > 0 as long as vcpu_id < nr_vcpus */
+}
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
+static uint64_t guessmemsize (void)    /* estimate the guest RAM size in bytes from the E820 map */
+{
+    uint64_t ret = 0;
+    struct e820entry *map = HVM_E820;
+    int i;
+
+    for ( i = 0; i < *HVM_E820_NR ; i++)    /* *HVM_E820_NR bounds the walk over map[] */
+    {
+        if (map[i].addr == ONEMB )          /* entry starting at 1MB; matched by address only, the type is not checked */
+            ret+=map[i].size + PAGE_SIZE * 3 + ONEMB;   /* also counts the first MB plus three pages; NOTE(review): the three pages are presumably ones carved out of this range by the map builder - confirm */
+        if (map[i].addr == (1ULL << 32))    /* entry starting at 4GB */
+            ret+=map[i].size;
+    }
+    return ret;     /* stays 0 when neither entry is present */
+}
+
+int construct_srat(struct acpi_20_srat *srat)      /* build the SRAT at srat: header, one entry per VCPU, then memory entries; returns the 16-byte-aligned size */
+{
+    struct acpi_20_srat_processor *processor;
+    struct acpi_20_srat_memory    *memory;
+    struct e820entry *map = HVM_E820;
+    int i, offset = 0;             /* offset counts the bytes written so far */
+    uint64_t hvm_node_mem;         /* bytes of memory assigned to each node (whole megabytes) */
+
+    memset(srat, 0, sizeof(*srat));
+    srat->header.signature    = ACPI_2_0_SRAT_SIGNATURE;
+    srat->header.revision     = ACPI_2_0_SRAT_REVISION;
+    fixed_strcpy(srat->header.oem_id, ACPI_OEM_ID);
+    fixed_strcpy(srat->header.oem_table_id, ACPI_OEM_TABLE_ID);
+    srat->header.oem_revision = ACPI_OEM_REVISION;
+    srat->header.creator_id   = ACPI_CREATOR_ID;
+    srat->header.creator_revision = ACPI_CREATOR_REVISION;
+    srat->table_revision      = ACPI_SRAT_TABLE_REVISION;
+    offset += sizeof(*srat);
+
+    processor = (struct acpi_20_srat_processor *)(srat + 1);   /* entries start directly after the fixed header */
+    for ( i = 0; i < get_vcpu_nr(); i++ )
+    {
+        memset(processor, 0, sizeof(*processor));
+        processor->type    = ACPI_PROCESSOR_AFFIN;
+        processor->length  = sizeof(*processor);
+        processor->domain  = vcpu_to_numa_node (i, get_vcpu_nr());
+        processor->apic_id = LAPIC_ID(i);
+        processor->flags   = ACPI_LOCAL_APIC_AFFIN_ENABLED;
+        processor->sapic_id= 0;
+        offset += sizeof(*processor);
+        processor++;
+    }
+
+ /*
+  * Equally distribute the memory on all NUMA nodes. Round up the size
+  * of available memory to whole megabytes, as (at least) Linux cannot cope
+  * with uneven NUMA node boundaries. The remaining part of memory will be
+  * assigned to the last NUMA node. The mapping of the first MB is copied
+  * from the E820 map and assigned to node 0
+  */
+    hvm_node_mem = guessmemsize()+ONEMB-1;
+    hvm_node_mem = hvm_node_mem >> 20;     /* total size in megabytes, rounded up */
+ /* 64bit/32bit does not work because of missing libgcc */
+    hvm_node_mem = (uint32_t)hvm_node_mem / get_numanodes();   /* the MB count is truncated to 32 bits so this is a 32-bit division */
+    hvm_node_mem = hvm_node_mem << 20;     /* back to bytes */
+
+    memory = (struct acpi_20_srat_memory *)(processor);    /* memory entries follow the last processor entry */
+    for ( i = 0; i < *HVM_E820_NR; i++ )   /* copy the RAM entries below 1MB to node 0 */
+    {
+        if ( map[i].type != E820_RAM ) continue;
+        if ( map[i].addr >= ONEMB ) break;     /* stops at the first RAM entry at or above 1MB */
+
+        memset(memory, 0, sizeof(*memory));
+        memory->type        = ACPI_MEMORY_AFFIN;
+        memory->length      = sizeof(*memory);
+        memory->domain      = 0;
+        memory->base_address_lo = map[i].addr & 0xFFFFFFFFL;
+        memory->base_address_hi = map[i].addr >> 32;
+        memory->length_lo   = map[i].size & 0xFFFFFFFFL;
+        memory->length_hi   = map[i].size >> 32;
+        memory->flags       = ACPI_MEM_AFFIN_ENABLED;
+
+        offset += sizeof(*memory);
+        memory++;
+    }
+
+    for ( i = 0; i < get_numanodes(); i++ )    /* one entry per node; NOTE(review): ranges are laid out linearly from 1MB although guessmemsize() also counts an entry at 4GB - confirm that no address hole needs to be skipped */
+    {
+        memset(memory, 0, sizeof(*memory));
+        memory->type        = ACPI_MEMORY_AFFIN;
+        memory->length      = sizeof(*memory);
+        memory->domain      = i;
+        if ( i == 0 )      /* node 0 starts at 1MB; the first MB was emitted by the loop above */
+        {
+            memory->base_address_lo = ONEMB;
+            memory->base_address_hi = 0;
+            memory->length_lo   = ( hvm_node_mem  - ONEMB ) & 0xFFFFFFFFL;
+            memory->length_hi   = ( hvm_node_mem  - ONEMB ) >> 32;
+        } else
+        if ( i == get_numanodes()-1 )      /* the last node takes whatever remains of the guessed size */
+        {
+            memory->base_address_lo = (i * hvm_node_mem) & 0xFFFFFFFFL;
+            memory->base_address_hi = (i * hvm_node_mem) >> 32;
+            memory->length_lo   = (guessmemsize()-hvm_node_mem*i) & 0xFFFFFFFFL;
+            memory->length_hi   = (guessmemsize()-hvm_node_mem*i) >> 32;
+        } else
+        {
+            memory->base_address_lo = (i * hvm_node_mem) & 0xFFFFFFFFL;
+            memory->base_address_hi = (i * hvm_node_mem) >> 32;
+            memory->length_lo   = hvm_node_mem & 0xFFFFFFFFL;
+            memory->length_hi   = hvm_node_mem >> 32;
+        }
+        memory->flags       = ACPI_MEM_AFFIN_ENABLED;
+        offset += sizeof(*memory);
+        memory++;
+    }
+
+    srat->header.length = offset;
+    set_checksum(srat, offsetof(struct acpi_header, checksum), offset);
+
+    return align16(offset);        /* size rounded up to a multiple of 16; the caller adds it to its buffer offset */
 }
 
 static int uart_exists(uint16_t uart_base)
@@ -192,6 +329,7 @@
 static int construct_secondary_tables(uint8_t *buf, unsigned long *table_ptrs)
 {
     int offset = 0, nr_tables = 0;
+    struct acpi_20_srat *srat;
     struct acpi_20_madt *madt;
     struct acpi_20_hpet *hpet;
     struct acpi_20_tcpa *tcpa;
@@ -204,6 +342,14 @@
         madt = (struct acpi_20_madt *)&buf[offset];
         offset += construct_madt(madt);
         table_ptrs[nr_tables++] = (unsigned long)madt;
+    }
+
+    /* SRAT. */
+    if ( get_numanodes() > 0 )
+    {
+        srat = (struct acpi_20_srat *)&buf[offset];
+        offset += construct_srat(srat);
+        table_ptrs[nr_tables++] = (unsigned long)srat;
     }
 
     /* HPET. */
diff -r b84c5f2fe83b -r aa69281c1ecf tools/firmware/hvmloader/util.c
--- a/tools/firmware/hvmloader/util.c   Thu Jul 03 13:17:11 2008 +0200
+++ b/tools/firmware/hvmloader/util.c   Thu Jul 03 13:20:35 2008 +0200
@@ -594,6 +594,12 @@
     return (t ? t->nr_vcpus : 1);
 }
 
+int get_numanodes(void)        /* number of guest NUMA nodes as recorded in the HVM info table */
+{
+    struct hvm_info_table *t = get_hvm_info_table();
+    return (t ? t->numanodes : 1);     /* 1 when no info table is available; NOTE(review): the builder stores a bit count of a node mask, so 0 looks possible - confirm */
+}
+
 int get_acpi_enabled(void)
 {
     struct hvm_info_table *t = get_hvm_info_table();
diff -r b84c5f2fe83b -r aa69281c1ecf tools/firmware/hvmloader/util.h
--- a/tools/firmware/hvmloader/util.h   Thu Jul 03 13:17:11 2008 +0200
+++ b/tools/firmware/hvmloader/util.h   Thu Jul 03 13:20:35 2008 +0200
@@ -104,6 +104,7 @@
 
 /* HVM-builder info. */
 int get_vcpu_nr(void);
+int get_numanodes(void);
 int get_acpi_enabled(void);
 int get_apic_mode(void);
 
diff -r b84c5f2fe83b -r aa69281c1ecf tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Thu Jul 03 13:17:11 2008 +0200
+++ b/tools/python/xen/lowlevel/xc/xc.c Thu Jul 03 13:20:35 2008 +0200
@@ -845,6 +845,18 @@
 
 #endif /* __i386__ || __x86_64__ */
 
+static unsigned hweight_long (unsigned long value)     /* count the bits set in value */
+{
+int ret=0;         /* running count; converted to unsigned on return */
+
+    while (value>0)        /* stops once no set bits remain */
+    {
+        if (value&1) ++ret;        /* count the lowest bit if it is set */
+        value>>=1;                 /* then move on to the next bit */
+    }
+    return ret;
+}
+
 static PyObject *pyxc_hvm_build(XcObject *self,
                                 PyObject *args,
                                 PyObject *kwds)
@@ -884,6 +896,7 @@
     va_hvm->acpi_enabled = acpi;
     va_hvm->apic_mode    = apic;
     va_hvm->nr_vcpus     = vcpus;
+    va_hvm->numanodes    = hweight_long(nodemask);
     for ( i = 0, sum = 0; i < va_hvm->length; i++ )
         sum += ((uint8_t *)va_hvm)[i];
     va_hvm->checksum = -sum;
diff -r b84c5f2fe83b -r aa69281c1ecf xen/include/public/hvm/hvm_info_table.h
--- a/xen/include/public/hvm/hvm_info_table.h   Thu Jul 03 13:17:11 2008 +0200
+++ b/xen/include/public/hvm/hvm_info_table.h   Thu Jul 03 13:20:35 2008 +0200
@@ -36,6 +36,7 @@
     uint8_t     acpi_enabled;
     uint8_t     apic_mode;
     uint32_t    nr_vcpus;
+    uint32_t    numanodes;
 };
 
 #endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-devel] [PATCH 4/4] hvm: NUMA guest: inject NUMA topology into the guest, Andre Przywara <=