Re: [Xen-devel] Re: [PATCH 2/6][RESEND] xen: Add NUMA support to Xen

* Keir Fraser <Keir.Fraser@xxxxxxxxxxxx> [2006-05-13 04:33]:
> 
> On 12 May 2006, at 16:12, Ryan Harper wrote:
> 
> >>Then in
> >>init_domheap_pages(), there are no calls to memguard, just
> >>work to set up the range for a call to init_heap_pages().  I'm
> >>not sure if I need to use memguard for marking the chunk
> >>boundaries, or if just reserving chunk boundaries that weren't
> >>already on a MAX_ORDER edge via map_alloc() is sufficient.
> 
> Just reserving is sufficient. memguard is a debug-build aid for xen 
> heap only.
> 
> >>Also, I didn't see a way to ensure reserved pages aren't freed via a
> >>call to init_heap_pages() which just clears out a range of bits in
> >>the alloc map.  Should we be worried about that?
> 
> Well, in theory. It's only called in one place though right now, and 
> probably with a physical range below 128MB in all cases. So it's 
> unlikely to straddle a NUMA boundary.

OK.

> 
> Another comment on this patch (2/2): page_to_node() should be defined 
> in numa.c (for now) rather than page_alloc.c (where it will never 
> belong). I know you also directly scan the chunk array to find 
> boundaries to reserve: perhaps for now we could have 
> end_boot_allocator() call init_heap_pages() for each page, then 
> init_heap_pages() can compare page_to_node(page) with 
> page_to_node(page-1). If they differ and the latter is not -1 and page 
> is not on a MAX_ORDER boundary, then you do not free the page to the 
> buddy allocator.
> 
> Clearly this will be crappily slow, but it's only used at boot time and 
> as long as it's not *too* bad (which it certainly won't be for short 
> chunk lists) then it'll do until we improve things (probably with a 
> fast constant-time page_to_node() implementation).
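
In rough C, the check you describe would look something like the
sketch below (just a sketch, not the patch further down;
MFN_MAX_ORDER_ALIGNED() is a made-up helper, and page_to_node() is
assumed to return -1 when the node is unknown):

#define MFN_MAX_ORDER_ALIGNED(mfn) (((mfn) & ((1UL << MAX_ORDER) - 1)) == 0)

void init_heap_pages(
    unsigned int zone, struct page_info *pg, unsigned long nr_pages)
{
    unsigned long i;
    int curr, prev;

    for ( i = 0; i < nr_pages; i++ )
    {
        /* Node of this page and of the page immediately before it. */
        curr = page_to_node(pg + i);
        prev = (page_to_mfn(pg + i) == 0) ? curr : page_to_node(pg + i - 1);

        /*
         * A page that starts a new node is reserved (never handed to the
         * buddy allocator) unless it already sits on a MAX_ORDER boundary,
         * where no merging across the node edge can happen anyway.
         */
        if ( (curr != prev) && (prev != -1) &&
             !MFN_MAX_ORDER_ALIGNED(page_to_mfn(pg + i)) )
            continue;

        free_heap_pages(zone, pg + i, 0);
    }
}

The patch below does the same thing, except it uses phys_to_nid() from
the new NUMA code rather than a page_to_node() helper.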

I've re-worked this patch to use the new ACPI/x86_64 NUMA infrastructure
I just resent.  Now page_alloc.c only uses MAX_NUMNODES and the
phys_to_nid() function.  I've modified end_boot_allocator/init_heap_pages
as described above.  

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@xxxxxxxxxx


diffstat output:
 common/page_alloc.c |  198 ++++++++++++++++++++++++++++++++++++++++------------
 include/xen/mm.h    |    6 +
 2 files changed, 160 insertions(+), 44 deletions(-)

Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
---
diff -r 99e60f5df1d8 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Wed May 31 15:36:41 2006
+++ b/xen/common/page_alloc.c   Wed May 31 14:10:01 2006
@@ -4,6 +4,7 @@
  * Simple buddy heap allocator for Xen.
  * 
  * Copyright (c) 2002-2004 K A Fraser
+ * Copyright (c) 2006 IBM Ryan Harper <ryanh@xxxxxxxxxx>
  * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -33,6 +34,7 @@
 #include <xen/shadow.h>
 #include <xen/domain_page.h>
 #include <xen/keyhandler.h>
+#include <asm/numa.h>
 #include <asm/page.h>
 
 /*
@@ -246,22 +248,23 @@
 #define pfn_dom_zone_type(_pfn)                                 \
     (((_pfn) <= MAX_DMADOM_PFN) ? MEMZONE_DMADOM : MEMZONE_DOM)
 
-static struct list_head heap[NR_ZONES][MAX_ORDER+1];
-
-static unsigned long avail[NR_ZONES];
+static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1];
+
+static unsigned long avail[NR_ZONES][MAX_NUMNODES];
 
 static spinlock_t heap_lock = SPIN_LOCK_UNLOCKED;
 
 void end_boot_allocator(void)
 {
-    unsigned long i, j;
+    unsigned long i, j, k;
     int curr_free = 0, next_free = 0;
 
     memset(avail, 0, sizeof(avail));
 
     for ( i = 0; i < NR_ZONES; i++ )
-        for ( j = 0; j <= MAX_ORDER; j++ )
-            INIT_LIST_HEAD(&heap[i][j]);
+        for ( j = 0; j < MAX_NUMNODES; j++ )
+            for ( k = 0; k <= MAX_ORDER; k++ )
+                INIT_LIST_HEAD(&heap[i][j][k]);
 
     /* Pages that are free now go to the domain sub-allocator. */
     for ( i = 0; i < max_page; i++ )
@@ -271,29 +274,58 @@
         if ( next_free )
             map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */
         if ( curr_free )
-            free_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 0);
-    }
-}
-
-/* Hand the specified arbitrary page range to the specified heap zone. */
+            init_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 1);
+    }
+}
+
+/* 
+ * Hand the specified arbitrary page range to the specified heap zone,
+ * comparing each page's node with that of the previous page.  If they
+ * differ and the page is not on a MAX_ORDER boundary, then we reserve
+ * the page by not freeing it to the buddy allocator.
+ */
+#define MAX_ORDER_ALIGNED (1UL << (MAX_ORDER))
 void init_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned long nr_pages)
 {
+    unsigned int nid_curr,nid_prev;
     unsigned long i;
 
     ASSERT(zone < NR_ZONES);
 
+    if ( likely(page_to_mfn(pg) != 0) )
+        nid_prev = phys_to_nid(page_to_maddr(pg-1));
+    else
+        nid_prev = phys_to_nid(page_to_maddr(pg));
+
     for ( i = 0; i < nr_pages; i++ )
-        free_heap_pages(zone, pg+i, 0);
-}
-
+    {
+        nid_curr = phys_to_nid(page_to_maddr(pg+i));
+
+        /*
+         * free pages of the same node, or if they differ, but are on a
+         * MAX_ORDER alignment boundary (which already get reserved)
+         */
+         if ( (nid_curr == nid_prev) || (page_to_maddr(pg+i) &
+                                         MAX_ORDER_ALIGNED) )
+             free_heap_pages(zone, pg+i, 0);
+         else
+             printk("Reserving non-aligned node boundary @ mfn %lu\n",
+                    page_to_mfn(pg+i));
+
+        nid_prev = nid_curr;
+    }
+}
 
 /* Allocate 2^@order contiguous pages. */
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order)
-{
-    int i;
+struct page_info *alloc_heap_pages(unsigned int zone, unsigned int cpu,
+                                   unsigned int order)
+{
+    int i,j, node;
     struct page_info *pg;
 
+    ASSERT(cpu_to_node[cpu] >= 0);
+    ASSERT(cpu_to_node[cpu] < num_online_nodes());
     ASSERT(zone < NR_ZONES);
 
     if ( unlikely(order > MAX_ORDER) )
@@ -301,29 +333,38 @@
 
     spin_lock(&heap_lock);
 
-    /* Find smallest order which can satisfy the request. */
-    for ( i = order; i <= MAX_ORDER; i++ )
-        if ( !list_empty(&heap[zone][i]) )
-            goto found;
+    /* start with requested node, but exhaust all node memory
+     * in requested zone before failing */
+    for ( i = 0; i < num_online_nodes(); i++ )
+    {
+        node = (cpu_to_node[cpu]+i) % num_online_nodes();
+        /* Find smallest order which can satisfy the request. */
+        for ( j = order; j <= MAX_ORDER; j++ )
+        {
+            if ( !list_empty(&heap[zone][node][j]) )
+                goto found;
+        }
+    }
 
     /* No suitable memory blocks. Fail the request. */
     spin_unlock(&heap_lock);
     return NULL;
 
  found: 
-    pg = list_entry(heap[zone][i].next, struct page_info, list);
+    pg = list_entry(heap[zone][node][j].next, struct page_info, list);
     list_del(&pg->list);
 
     /* We may have to halve the chunk a number of times. */
-    while ( i != order )
-    {
-        PFN_ORDER(pg) = --i;
-        list_add_tail(&pg->list, &heap[zone][i]);
-        pg += 1 << i;
+    while ( j != order )
+    {
+        PFN_ORDER(pg) = --j;
+        list_add_tail(&pg->list, &heap[zone][node][j]);
+        pg += 1 << j;
     }
     
     map_alloc(page_to_mfn(pg), 1 << order);
-    avail[zone] -= 1 << order;
+    ASSERT(avail[zone][node] >= (1 << order));
+    avail[zone][node] -= 1 << order;
 
     spin_unlock(&heap_lock);
 
@@ -336,14 +377,17 @@
     unsigned int zone, struct page_info *pg, unsigned int order)
 {
     unsigned long mask;
+    int node = phys_to_nid(page_to_maddr(pg));
 
     ASSERT(zone < NR_ZONES);
     ASSERT(order <= MAX_ORDER);
+    ASSERT(node >= 0);
+    ASSERT(node < num_online_nodes());
 
     spin_lock(&heap_lock);
 
     map_free(page_to_mfn(pg), 1 << order);
-    avail[zone] += 1 << order;
+    avail[zone][node] += 1 << order;
     
     /* Merge chunks as far as possible. */
     while ( order < MAX_ORDER )
@@ -369,10 +413,13 @@
         }
         
         order++;
+
+        /* after merging, pg should be in the same node */
+        ASSERT(phys_to_nid(page_to_maddr(pg)) == node );
     }
 
     PFN_ORDER(pg) = order;
-    list_add_tail(&pg->list, &heap[zone][order]);
+    list_add_tail(&pg->list, &heap[zone][node][order]);
 
     spin_unlock(&heap_lock);
 }
@@ -467,7 +514,7 @@
     int i;
 
     local_irq_save(flags);
-    pg = alloc_heap_pages(MEMZONE_XEN, order);
+    pg = alloc_heap_pages(MEMZONE_XEN, smp_processor_id(), order);
     local_irq_restore(flags);
 
     if ( unlikely(pg == NULL) )
@@ -531,8 +578,8 @@
 }
 
 
-struct page_info *alloc_domheap_pages(
-    struct domain *d, unsigned int order, unsigned int flags)
+struct page_info *__alloc_domheap_pages(
+    struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags)
 {
     struct page_info *pg = NULL;
     cpumask_t mask;
@@ -542,17 +589,17 @@
 
     if ( !(flags & ALLOC_DOM_DMA) )
     {
-        pg = alloc_heap_pages(MEMZONE_DOM, order);
+        pg = alloc_heap_pages(MEMZONE_DOM, cpu, order);
         /* Failure? Then check if we can fall back to the DMA pool. */
         if ( unlikely(pg == NULL) &&
              ((order > MAX_ORDER) ||
-              (avail[MEMZONE_DMADOM] <
+              (avail_heap_pages(MEMZONE_DMADOM,-1) <
                (lowmem_emergency_pool_pages + (1UL << order)))) )
             return NULL;
     }
 
     if ( pg == NULL )
-        if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL )
+        if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, cpu, order)) == NULL )
             return NULL;
 
     mask = pg->u.free.cpumask;
@@ -615,6 +662,13 @@
     spin_unlock(&d->page_alloc_lock);
     
     return pg;
+}
+
+inline struct page_info *alloc_domheap_pages(
+    struct domain *d, unsigned int order, unsigned int flags)
+{
+    return __alloc_domheap_pages(d, smp_processor_id(), order, flags);
+
 }
 
 
@@ -690,13 +744,27 @@
 }
 
 
+unsigned long avail_heap_pages(int zone, int node)
+{
+    int i,j;
+    unsigned long free_pages = 0;
+   
+    for (i=0; i<NR_ZONES; i++)
+        if ( (zone == -1) || (zone == i) )
+            for (j=0; j<num_online_nodes(); j++)
+                if ( (node == -1) || (node == j) )
+                    free_pages += avail[i][j];            
+
+    return free_pages;
+}
+
 unsigned long avail_domheap_pages(void)
 {
     unsigned long avail_nrm, avail_dma;
-
-    avail_nrm = avail[MEMZONE_DOM];
-
-    avail_dma = avail[MEMZONE_DMADOM];
+    
+    avail_nrm = avail_heap_pages(MEMZONE_DOM,-1);
+
+    avail_dma = avail_heap_pages(MEMZONE_DMADOM,-1);
     if ( avail_dma > lowmem_emergency_pool_pages )
         avail_dma -= lowmem_emergency_pool_pages;
     else
@@ -705,6 +773,10 @@
     return avail_nrm + avail_dma;
 }
 
+unsigned long avail_nodeheap_pages(int node)
+{
+    return avail_heap_pages(-1, node);
+}
 
 static void pagealloc_keyhandler(unsigned char key)
 {
@@ -712,9 +784,9 @@
     printk("    Xen heap: %lukB free\n"
            "    DMA heap: %lukB free\n"
            "    Dom heap: %lukB free\n",
-           avail[MEMZONE_XEN]<<(PAGE_SHIFT-10),
-           avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10),
-           avail[MEMZONE_DOM]<<(PAGE_SHIFT-10));
+           avail_heap_pages(MEMZONE_XEN, -1) << (PAGE_SHIFT-10), 
+           avail_heap_pages(MEMZONE_DMADOM, -1) <<(PAGE_SHIFT-10), 
+           avail_heap_pages(MEMZONE_DOM, -1) <<(PAGE_SHIFT-10));
 }
 
 
@@ -776,6 +848,46 @@
     } while ( (NOW() - start) < MILLISECS(1) );
 }
 
+static unsigned long count_bucket(struct list_head* l, int order)
+{
+   unsigned long total_pages = 0;
+   int pages = 1 << order;
+   struct page_info *pg;
+
+   list_for_each_entry(pg, l, list)
+       total_pages += pages;
+
+   return total_pages;
+}
+
+static void dump_heap(unsigned char key)
+{
+    s_time_t       now = NOW();
+    int i,j,k;
+    unsigned long total;
+
+    printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
+           (u32)(now>>32), (u32)now);
+
+    for (i=0; i<NR_ZONES; i++ )
+        for (j=0;j<MAX_NUMNODES;j++)
+            for (k=0;k<=MAX_ORDER;k++)
+                if ( !list_empty(&heap[i][j][k]) )
+                {
+                    total = count_bucket(&heap[i][j][k], k);
+                    printk("heap[%d][%d][%d]-> %lu pages\n",
+                           i, j, k, total);
+                }
+}
+
+static __init int register_heap_trigger(void)
+{
+   register_keyhandler('H', dump_heap, "dump heap info");
+   return 0;
+}
+__initcall(register_heap_trigger);
+
+
 static __init int page_scrub_init(void)
 {
     open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq);
diff -r 99e60f5df1d8 xen/include/xen/mm.h
--- a/xen/include/xen/mm.h      Wed May 31 15:36:41 2006
+++ b/xen/include/xen/mm.h      Wed May 31 14:10:01 2006
@@ -45,7 +45,8 @@
 /* Generic allocator. These functions are *not* interrupt-safe. */
 void init_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned long nr_pages);
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order);
+struct page_info *alloc_heap_pages(
+    unsigned int zone, unsigned int cpu, unsigned int order);
 void free_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned int order);
 void scrub_heap_pages(void);
@@ -61,8 +62,11 @@
 void init_domheap_pages(paddr_t ps, paddr_t pe);
 struct page_info *alloc_domheap_pages(
     struct domain *d, unsigned int order, unsigned int flags);
+struct page_info *__alloc_domheap_pages(
+    struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags);
 void free_domheap_pages(struct page_info *pg, unsigned int order);
 unsigned long avail_domheap_pages(void);
+unsigned long avail_heap_pages(int zone, int node);
 #define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0))
 #define free_domheap_page(p)  (free_domheap_pages(p,0))
 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel