xen-devel

Re: [Xen-devel] [PATCH 2/6] xen: Add NUMA support to Xen

* Keir Fraser <Keir.Fraser@xxxxxxxxxxxx> [2006-05-02 11:23]:
> 
> On 2 May 2006, at 15:53, Ryan Harper wrote:
> 
> >>Loops over every memory chunk structure on the alloc/free paths aren't
> >>going to get merged. There's no need for it -- in most cases memory
> >>chunks are probably aligned on a MAX_ORDER boundary (or they will be
> >>when I reduce MAX_ORDER, which requires me to fix up our Linux swiotlb
> >>a bit first). When that isn't the case you can simply reserve guard
> >>pages at the start and end of such chunks to avoid cross-chunk 
> >>merging.
> >
> >I'll toss page_spans_chunk() and the user in the free path, use some
> >guard pages and resubmit.
> 
> Great. Please do make it conditional on the start/end not being on a 
> MAX_ORDER boundary though -- that's a worthwhile optimisation to avoid 
> the guard page.
> 
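
For reference, the alignment test in question, at page-frame granularity
(purely illustrative -- the helper name is mine, and MAX_ORDER is
whatever the tree defines):

    /* A chunk boundary sits on a MAX_ORDER boundary when its page frame
     * number has no bits set below MAX_ORDER; only boundaries failing
     * this test need a sacrificial guard page. */
    static inline int max_order_aligned(unsigned long pfn)
    {
        return (pfn & ((1UL << MAX_ORDER) - 1)) == 0;
    }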

I've taken out the CONFIG_NUMA ifdefs and dropped page_spans_chunk();
instead I mark the chunk boundaries in the alloc bitmap when they aren't
on MAX_ORDER boundaries and aren't already allocated.  I'm not clear on
the difference between reserving the sensitive pages in the alloc bitmap
(via map_alloc()) and using the memguard routines.  For instance, in
init_xenheap_pages() the range is guarded and then handed to the heap
(the call to init_heap_pages(), which clears the alloc bitmap), whereas
init_domheap_pages() makes no memguard calls and just sets the range up
for a call to init_heap_pages().  I'm not sure whether I need to use
memguard to mark the chunk boundaries, or whether reserving the chunk
boundaries that weren't already on a MAX_ORDER edge via map_alloc() is
sufficient.
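
To make the question concrete, here is the map_alloc() route as a
standalone sketch (the memguard signature in the comment is only my
reading of the x86 memguard code, so treat it as an assumption):

    /* Reserve a single sacrificial page at a chunk boundary so the buddy
     * allocator can never merge across it.  This only marks the page in
     * the boot allocator's bitmap; the memguard alternative would be
     * something like memguard_guard_range(maddr_to_virt(paddr), PAGE_SIZE),
     * which as I understand it also shoots down the mapping to catch
     * stray accesses. */
    static void reserve_boundary_page(paddr_t paddr)
    {
        unsigned long pfn = paddr_to_pfn(paddr);

        if ( !allocated_in_map(pfn) )
            map_alloc(pfn, 1);
    }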

Also, I didn't see a way to ensure reserved pages aren't freed via a
call to init_heap_pages(), which just clears out a range of bits in the
alloc map.  Should we be worried about that?
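
If it is worth guarding against, one illustrative option (not part of
the patch below, and using mfn_to_page()/allocated_in_map() as I read
them in the tree) would be for the caller to skip pages already marked
in the alloc bitmap before handing a range to the heap:

    /* Illustrative only: hand [ps, pe) to the heap one page at a time,
     * skipping anything already reserved in the alloc bitmap, so that a
     * sacrificial boundary page cannot be freed by accident. */
    static void init_heap_range_checked(unsigned int zone, paddr_t ps, paddr_t pe)
    {
        unsigned long pfn;

        for ( pfn = paddr_to_pfn(ps); pfn < paddr_to_pfn(pe); pfn++ )
            if ( !allocated_in_map(pfn) )
                init_heap_pages(zone, mfn_to_page(pfn), 1);
    }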

Attached is what the current working patch looks like.  Let me know if
this is more to your liking.  If so, I'll re-spin the whole patchset and
test it across the set of test machines we have (NUMA and non-NUMA).

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@xxxxxxxxxx


---
diff -r 38ba1fe5009c xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Tue May  9 02:23:08 2006
+++ b/xen/common/page_alloc.c   Mon May  8 21:27:53 2006
@@ -4,6 +4,7 @@
  * Simple buddy heap allocator for Xen.
  * 
  * Copyright (c) 2002-2004 K A Fraser
+ * Copyright (c) 2006 IBM
  * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -34,6 +35,25 @@
 #include <xen/domain_page.h>
 #include <xen/keyhandler.h>
 #include <asm/page.h>
+#include <xen/nodemask.h>
+#include <xen/numa.h>
+
+extern int num_memory_chunks;
+extern node_memory_chunk_t node_memory_chunk[];
+extern int cpu_to_node[];
+
+/* map a given page_info to the node it came from */
+int page_to_node(struct page_info *pg)
+{
+    node_memory_chunk_t *c = node_memory_chunk;
+    u64 pg_paddr = page_to_maddr(pg);
+
+    for (; c < (node_memory_chunk+num_memory_chunks); c++)
+        if ( (pg_paddr >= c->start_paddr) && (pg_paddr <= c->end_paddr) )
+            return (int)c->nid;
+
+    return -1;
+}
 
 /*
  * Comma-separated list of hexadecimal page numbers containing bad bytes.
@@ -246,22 +266,43 @@
 #define pfn_dom_zone_type(_pfn)                                 \
     (((_pfn) <= MAX_DMADOM_PFN) ? MEMZONE_DMADOM : MEMZONE_DOM)
 
-static struct list_head heap[NR_ZONES][MAX_ORDER+1];
-
-static unsigned long avail[NR_ZONES];
+static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1];
+
+static unsigned long avail[NR_ZONES][MAX_NUMNODES];
 
 static spinlock_t heap_lock = SPIN_LOCK_UNLOCKED;
 
+#define NOT_MAX_ORDER_ALIGNED ((1UL << MAX_ORDER)-1)
 void end_boot_allocator(void)
 {
-    unsigned long i, j;
+    unsigned long i, j, k;
     int curr_free = 0, next_free = 0;
 
     memset(avail, 0, sizeof(avail));
 
     for ( i = 0; i < NR_ZONES; i++ )
-        for ( j = 0; j <= MAX_ORDER; j++ )
-            INIT_LIST_HEAD(&heap[i][j]);
+        for ( j = 0; j < MAX_NUMNODES; j++ )
+            for ( k = 0; k <= MAX_ORDER; k++ )
+                INIT_LIST_HEAD(&heap[i][j][k]);
+
+    /* mark NUMA chunk boundaries in multi-node systems */
+    if ( num_online_nodes() > 1 )
+    {
+        node_memory_chunk_t *c = node_memory_chunk;
+
+        /* sacrifice the ends of a chunk if not MAX_ORDER 
+           aligned to prevent merging across chunks */
+        for (; c < (node_memory_chunk+num_memory_chunks); c++ )
+        {
+            if ( (c->start_paddr & NOT_MAX_ORDER_ALIGNED) &&
+                 !allocated_in_map(paddr_to_pfn(c->start_paddr)) )
+                map_alloc(paddr_to_pfn(c->start_paddr), 1);
+
+            if ( (c->end_paddr & NOT_MAX_ORDER_ALIGNED) &&
+                 !allocated_in_map(paddr_to_pfn(c->end_paddr)) )
+                map_alloc(paddr_to_pfn(c->end_paddr), 1);
+        }
+    }
 
     /* Pages that are free now go to the domain sub-allocator. */
     for ( i = 0; i < max_page; i++ )
@@ -289,11 +330,14 @@
 
 
 /* Allocate 2^@order contiguous pages. */
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order)
-{
-    int i;
+struct page_info *alloc_heap_pages(unsigned int zone, unsigned int cpu,
+                                   unsigned int order)
+{
+    int i,j, node;
     struct page_info *pg;
 
+    ASSERT(cpu_to_node[cpu] >= 0);
+    ASSERT(cpu_to_node[cpu] < num_online_nodes());
     ASSERT(zone < NR_ZONES);
 
     if ( unlikely(order > MAX_ORDER) )
@@ -301,29 +345,36 @@
 
     spin_lock(&heap_lock);
 
-    /* Find smallest order which can satisfy the request. */
-    for ( i = order; i <= MAX_ORDER; i++ )
-        if ( !list_empty(&heap[zone][i]) )
-            goto found;
+    /* start with requested node, but exhaust all node memory
+     * in requested zone before failing */
+    for ( i = 0; i < num_online_nodes(); i++ )
+    {
+        node = (cpu_to_node[cpu]+i) % num_online_nodes();
+        /* Find smallest order which can satisfy the request. */
+        for ( j = order; j <= MAX_ORDER; j++ ) {
+            if ( !list_empty(&heap[zone][node][j]) )
+                goto found;
+        }
+    }
 
     /* No suitable memory blocks. Fail the request. */
     spin_unlock(&heap_lock);
     return NULL;
 
  found: 
-    pg = list_entry(heap[zone][i].next, struct page_info, list);
+    pg = list_entry(heap[zone][node][j].next, struct page_info, list);
     list_del(&pg->list);
 
     /* We may have to halve the chunk a number of times. */
-    while ( i != order )
-    {
-        PFN_ORDER(pg) = --i;
-        list_add_tail(&pg->list, &heap[zone][i]);
-        pg += 1 << i;
+    while ( j != order )
+    {
+        PFN_ORDER(pg) = --j;
+        list_add_tail(&pg->list, &heap[zone][node][j]);
+        pg += 1 << j;
     }
     
     map_alloc(page_to_mfn(pg), 1 << order);
-    avail[zone] -= 1 << order;
+    avail[zone][node] -= 1 << order;
 
     spin_unlock(&heap_lock);
 
@@ -336,14 +387,17 @@
     unsigned int zone, struct page_info *pg, unsigned int order)
 {
     unsigned long mask;
+    int node = page_to_node(pg);
 
     ASSERT(zone < NR_ZONES);
     ASSERT(order <= MAX_ORDER);
+    ASSERT(node >= 0);
+    ASSERT(node < num_online_nodes());
 
     spin_lock(&heap_lock);
 
     map_free(page_to_mfn(pg), 1 << order);
-    avail[zone] += 1 << order;
+    avail[zone][node] += 1 << order;
     
     /* Merge chunks as far as possible. */
     while ( order < MAX_ORDER )
@@ -372,7 +426,7 @@
     }
 
     PFN_ORDER(pg) = order;
-    list_add_tail(&pg->list, &heap[zone][order]);
+    list_add_tail(&pg->list, &heap[zone][node][order]);
 
     spin_unlock(&heap_lock);
 }
@@ -467,7 +521,7 @@
     int i;
 
     local_irq_save(flags);
-    pg = alloc_heap_pages(MEMZONE_XEN, order);
+    pg = alloc_heap_pages(MEMZONE_XEN, smp_processor_id(), order);
     local_irq_restore(flags);
 
     if ( unlikely(pg == NULL) )
@@ -531,8 +585,8 @@
 }
 
 
-struct page_info *alloc_domheap_pages(
-    struct domain *d, unsigned int order, unsigned int flags)
+struct page_info *__alloc_domheap_pages(
+    struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags)
 {
     struct page_info *pg = NULL;
     cpumask_t mask;
@@ -542,17 +596,17 @@
 
     if ( !(flags & ALLOC_DOM_DMA) )
     {
-        pg = alloc_heap_pages(MEMZONE_DOM, order);
+        pg = alloc_heap_pages(MEMZONE_DOM, cpu, order);
         /* Failure? Then check if we can fall back to the DMA pool. */
-        if ( unlikely(pg == NULL) &&
-             ((order > MAX_ORDER) ||
-              (avail[MEMZONE_DMADOM] <
+        if ( unlikely(pg == NULL) 
+             && ((order > MAX_ORDER) ||
+              (avail_heap_pages(MEMZONE_DMADOM,-1) <
                (lowmem_emergency_pool_pages + (1UL << order)))) )
             return NULL;
     }
 
     if ( pg == NULL )
-        if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL )
+        if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, cpu, order)) == NULL )
             return NULL;
 
     mask = pg->u.free.cpumask;
@@ -615,6 +669,13 @@
     spin_unlock(&d->page_alloc_lock);
     
     return pg;
+}
+
+inline struct page_info *alloc_domheap_pages(
+    struct domain *d, unsigned int order, unsigned int flags)
+{
+    return __alloc_domheap_pages(d, smp_processor_id(), order, flags);
+
 }
 
 
@@ -690,13 +751,27 @@
 }
 
 
+unsigned long avail_heap_pages(int zone, int node)
+{
+    int i,j;
+    unsigned long free_pages = 0;
+   
+    for (i=0; i<NR_ZONES; i++)
+        if ( (zone == -1) || (zone == i) )
+            for (j=0; j<num_online_nodes(); j++)
+                if ( (node == -1) || (node == j) )
+                    free_pages += avail[i][j];            
+
+    return free_pages;
+}
+
 unsigned long avail_domheap_pages(void)
 {
     unsigned long avail_nrm, avail_dma;
-
-    avail_nrm = avail[MEMZONE_DOM];
-
-    avail_dma = avail[MEMZONE_DMADOM];
+    
+    avail_nrm = avail_heap_pages(MEMZONE_DOM,-1);
+
+    avail_dma = avail_heap_pages(MEMZONE_DMADOM,-1);
     if ( avail_dma > lowmem_emergency_pool_pages )
         avail_dma -= lowmem_emergency_pool_pages;
     else
@@ -705,6 +780,10 @@
     return avail_nrm + avail_dma;
 }
 
+unsigned long avail_nodeheap_pages(int node)
+{
+    return avail_heap_pages(-1, node);
+}
 
 static void pagealloc_keyhandler(unsigned char key)
 {
@@ -712,9 +791,9 @@
     printk("    Xen heap: %lukB free\n"
            "    DMA heap: %lukB free\n"
            "    Dom heap: %lukB free\n",
-           avail[MEMZONE_XEN]<<(PAGE_SHIFT-10),
-           avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10),
-           avail[MEMZONE_DOM]<<(PAGE_SHIFT-10));
+           avail_heap_pages(MEMZONE_XEN, -1) << (PAGE_SHIFT-10), 
+           avail_heap_pages(MEMZONE_DMADOM, -1) <<(PAGE_SHIFT-10), 
+           avail_heap_pages(MEMZONE_DOM, -1) <<(PAGE_SHIFT-10));
 }
 
 
diff -r 38ba1fe5009c xen/include/xen/mm.h
--- a/xen/include/xen/mm.h      Tue May  9 02:23:08 2006
+++ b/xen/include/xen/mm.h      Mon May  8 21:27:53 2006
@@ -45,7 +45,8 @@
 /* Generic allocator. These functions are *not* interrupt-safe. */
 void init_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned long nr_pages);
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order);
+struct page_info *alloc_heap_pages(
+    unsigned int zone, unsigned int cpu, unsigned int order);
 void free_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned int order);
 void scrub_heap_pages(void);
@@ -61,8 +62,11 @@
 void init_domheap_pages(paddr_t ps, paddr_t pe);
 struct page_info *alloc_domheap_pages(
     struct domain *d, unsigned int order, unsigned int flags);
+struct page_info *__alloc_domheap_pages(
+    struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags);
 void free_domheap_pages(struct page_info *pg, unsigned int order);
 unsigned long avail_domheap_pages(void);
+unsigned long avail_heap_pages(int zone, int node);
 #define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0))
 #define free_domheap_page(p)  (free_domheap_pages(p,0))
 
diff -r 38ba1fe5009c xen/include/xen/numa.h
--- a/xen/include/xen/numa.h    Tue May  9 02:23:08 2006
+++ b/xen/include/xen/numa.h    Mon May  8 21:27:53 2006
@@ -35,6 +35,8 @@
 extern int cpu_to_node[];
 extern cpumask_t node_to_cpumask[];
 
+int page_to_node(struct page_info *pg);
+
 int numa_init(void);
 
 #endif /* _XEN_NUMA_H */
