* Keir Fraser <Keir.Fraser@xxxxxxxxxxxx> [2006-05-13 04:33]:
>
> On 12 May 2006, at 16:12, Ryan Harper wrote:
>
> >>Then in
> >>init_domheap_pages(), there are no calls to memguard, just
> >>work to set up the range for a call to init_heap_pages(). I'm
> >>not sure if I need to use memguard for marking the chunk
> >>boundaries, or if just reserving chunk boundaries that weren't
> >>already on a MAX_ORDER edge via map_alloc() is sufficient.
>
> Just reserving is sufficient. memguard is a debug-build aid for xen
> heap only.
>
> >>Also, I didn't see a way to ensure reserved pages aren't freed via a
> >>call to init_heap_pages() which just clears out a range of bits in
> >>the alloc map. Should we be worried about that?
>
> Well, in theory. It's only called in one place though right now, and
> probably with a physical range below 128MB in all cases. So it's
> unlikely to straddle a NUMA boundary.
OK.
>
> Another comment on this patch (2/2): page_to_node() should be defined
> in numa.c (for now) rather than page_alloc.c (where it will never
> belong). I know you also directly scan the chunk array to find
> boundaries to reserve: perhaps for now we could have
> end_boot_allocator() call init_heap_pages() for each page, then
> init_heap_pages() can compare page_to_node(page) with
> page_to_node(page-1). If they differ and the latter is not -1 and page
> is not on a MAX_ORDER boundary, then you do not free the page to the
> buddy allocator.
>
> Clearly this will be crappily slow, but it's only used at boot time and
> as long as it's not *too* bad (which it certainly won't be for short
> chunk lists) then it'll do until we improve things (probably with a
> fast constant-time page_to_node() implementation).
I've re-worked this patch to use the new ACPI/x86_64 NUMA infrastructure
I just resent. Now page_alloc.c only uses MAX_NUMNODES and the
phys_to_nid() function. I've modified end_boot_allocator/init_heap_pages
as described above.
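In short, the per-page decision in init_heap_pages() now comes down to
something like the sketch below (simplified from the patch that follows;
the boundary test is expressed here as an mfn alignment check, and the
surrounding loop and locals are elided):

    nid_curr = phys_to_nid(page_to_maddr(pg + i));

    /* Free the page unless it starts a new node away from a MAX_ORDER edge. */
    if ( (nid_curr == nid_prev) ||
         ((page_to_mfn(pg + i) & ((1UL << MAX_ORDER) - 1)) == 0) )
        free_heap_pages(zone, pg + i, 0);
    /* else: leave the page reserved so chunks never merge across nodes. */

    nid_prev = nid_curr;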
--
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253 T/L: 678-9253
ryanh@xxxxxxxxxx
diffstat output:
common/page_alloc.c | 198 ++++++++++++++++++++++++++++++++++++++++------------
include/xen/mm.h | 6 +
2 files changed, 160 insertions(+), 44 deletions(-)
Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
---
diff -r 99e60f5df1d8 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c Wed May 31 15:36:41 2006
+++ b/xen/common/page_alloc.c Wed May 31 14:10:01 2006
@@ -4,6 +4,7 @@
* Simple buddy heap allocator for Xen.
*
* Copyright (c) 2002-2004 K A Fraser
+ * Copyright (c) 2006 IBM Ryan Harper <ryanh@xxxxxxxxxx>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -33,6 +34,7 @@
#include <xen/shadow.h>
#include <xen/domain_page.h>
#include <xen/keyhandler.h>
+#include <asm/numa.h>
#include <asm/page.h>
/*
@@ -246,22 +248,23 @@
#define pfn_dom_zone_type(_pfn) \
(((_pfn) <= MAX_DMADOM_PFN) ? MEMZONE_DMADOM : MEMZONE_DOM)
-static struct list_head heap[NR_ZONES][MAX_ORDER+1];
-
-static unsigned long avail[NR_ZONES];
+static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1];
+
+static unsigned long avail[NR_ZONES][MAX_NUMNODES];
static spinlock_t heap_lock = SPIN_LOCK_UNLOCKED;
void end_boot_allocator(void)
{
- unsigned long i, j;
+ unsigned long i, j, k;
int curr_free = 0, next_free = 0;
memset(avail, 0, sizeof(avail));
for ( i = 0; i < NR_ZONES; i++ )
- for ( j = 0; j <= MAX_ORDER; j++ )
- INIT_LIST_HEAD(&heap[i][j]);
+ for ( j = 0; j < MAX_NUMNODES; j++ )
+ for ( k = 0; k <= MAX_ORDER; k++ )
+ INIT_LIST_HEAD(&heap[i][j][k]);
/* Pages that are free now go to the domain sub-allocator. */
for ( i = 0; i < max_page; i++ )
@@ -271,29 +274,58 @@
if ( next_free )
map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */
if ( curr_free )
- free_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 0);
- }
-}
-
-/* Hand the specified arbitrary page range to the specified heap zone. */
+ init_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 1);
+ }
+}
+
+/*
+ * Hand the specified arbitrary page range to the specified heap zone,
+ * comparing each page's node with that of the previous page. If the
+ * nodes differ and the current page is not on a MAX_ORDER boundary,
+ * we reserve the page by not freeing it to the buddy allocator.
+ */
+#define MAX_ORDER_ALIGNED (1UL << (MAX_ORDER))
void init_heap_pages(
unsigned int zone, struct page_info *pg, unsigned long nr_pages)
{
+ unsigned int nid_curr,nid_prev;
unsigned long i;
ASSERT(zone < NR_ZONES);
+ if ( likely(page_to_mfn(pg) != 0) )
+ nid_prev = phys_to_nid(page_to_maddr(pg-1));
+ else
+ nid_prev = phys_to_nid(page_to_maddr(pg));
+
for ( i = 0; i < nr_pages; i++ )
- free_heap_pages(zone, pg+i, 0);
-}
-
+ {
+ nid_curr = phys_to_nid(page_to_maddr(pg+i));
+
+ /*
+ * free pages of the same node, or if they differ, but are on a
+ * MAX_ORDER alignment boundary (which already get reserved)
+ */
+ if ( (nid_curr == nid_prev) || (page_to_maddr(pg+i) &
+ MAX_ORDER_ALIGNED) )
+ free_heap_pages(zone, pg+i, 0);
+ else
+ printk("Reserving non-aligned node boundary @ mfn %lu\n",
+ page_to_mfn(pg+i));
+
+ nid_prev = nid_curr;
+ }
+}
/* Allocate 2^@order contiguous pages. */
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order)
-{
- int i;
+struct page_info *alloc_heap_pages(unsigned int zone, unsigned int cpu,
+ unsigned int order)
+{
+ int i,j, node;
struct page_info *pg;
+ ASSERT(cpu_to_node[cpu] >= 0);
+ ASSERT(cpu_to_node[cpu] < num_online_nodes());
ASSERT(zone < NR_ZONES);
if ( unlikely(order > MAX_ORDER) )
@@ -301,29 +333,38 @@
spin_lock(&heap_lock);
- /* Find smallest order which can satisfy the request. */
- for ( i = order; i <= MAX_ORDER; i++ )
- if ( !list_empty(&heap[zone][i]) )
- goto found;
+ /* start with requested node, but exhaust all node memory
+ * in requested zone before failing */
+ for ( i = 0; i < num_online_nodes(); i++ )
+ {
+ node = (cpu_to_node[cpu]+i) % num_online_nodes();
+ /* Find smallest order which can satisfy the request. */
+ for ( j = order; j <= MAX_ORDER; j++ )
+ {
+ if ( !list_empty(&heap[zone][node][j]) )
+ goto found;
+ }
+ }
/* No suitable memory blocks. Fail the request. */
spin_unlock(&heap_lock);
return NULL;
found:
- pg = list_entry(heap[zone][i].next, struct page_info, list);
+ pg = list_entry(heap[zone][node][j].next, struct page_info, list);
list_del(&pg->list);
/* We may have to halve the chunk a number of times. */
- while ( i != order )
- {
- PFN_ORDER(pg) = --i;
- list_add_tail(&pg->list, &heap[zone][i]);
- pg += 1 << i;
+ while ( j != order )
+ {
+ PFN_ORDER(pg) = --j;
+ list_add_tail(&pg->list, &heap[zone][node][j]);
+ pg += 1 << j;
}
map_alloc(page_to_mfn(pg), 1 << order);
- avail[zone] -= 1 << order;
+ ASSERT(avail[zone][node] >= (1 << order));
+ avail[zone][node] -= 1 << order;
spin_unlock(&heap_lock);
@@ -336,14 +377,17 @@
unsigned int zone, struct page_info *pg, unsigned int order)
{
unsigned long mask;
+ int node = phys_to_nid(page_to_maddr(pg));
ASSERT(zone < NR_ZONES);
ASSERT(order <= MAX_ORDER);
+ ASSERT(node >= 0);
+ ASSERT(node < num_online_nodes());
spin_lock(&heap_lock);
map_free(page_to_mfn(pg), 1 << order);
- avail[zone] += 1 << order;
+ avail[zone][node] += 1 << order;
/* Merge chunks as far as possible. */
while ( order < MAX_ORDER )
@@ -369,10 +413,13 @@
}
order++;
+
+ /* after merging, pg should be in the same node */
+ ASSERT(phys_to_nid(page_to_maddr(pg)) == node );
}
PFN_ORDER(pg) = order;
- list_add_tail(&pg->list, &heap[zone][order]);
+ list_add_tail(&pg->list, &heap[zone][node][order]);
spin_unlock(&heap_lock);
}
@@ -467,7 +514,7 @@
int i;
local_irq_save(flags);
- pg = alloc_heap_pages(MEMZONE_XEN, order);
+ pg = alloc_heap_pages(MEMZONE_XEN, smp_processor_id(), order);
local_irq_restore(flags);
if ( unlikely(pg == NULL) )
@@ -531,8 +578,8 @@
}
-struct page_info *alloc_domheap_pages(
- struct domain *d, unsigned int order, unsigned int flags)
+struct page_info *__alloc_domheap_pages(
+ struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags)
{
struct page_info *pg = NULL;
cpumask_t mask;
@@ -542,17 +589,17 @@
if ( !(flags & ALLOC_DOM_DMA) )
{
- pg = alloc_heap_pages(MEMZONE_DOM, order);
+ pg = alloc_heap_pages(MEMZONE_DOM, cpu, order);
/* Failure? Then check if we can fall back to the DMA pool. */
if ( unlikely(pg == NULL) &&
((order > MAX_ORDER) ||
- (avail[MEMZONE_DMADOM] <
+ (avail_heap_pages(MEMZONE_DMADOM,-1) <
(lowmem_emergency_pool_pages + (1UL << order)))) )
return NULL;
}
if ( pg == NULL )
- if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL )
+ if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, cpu, order)) == NULL )
return NULL;
mask = pg->u.free.cpumask;
@@ -615,6 +662,13 @@
spin_unlock(&d->page_alloc_lock);
return pg;
+}
+
+inline struct page_info *alloc_domheap_pages(
+ struct domain *d, unsigned int order, unsigned int flags)
+{
+ return __alloc_domheap_pages(d, smp_processor_id(), order, flags);
+
}
@@ -690,13 +744,27 @@
}
+unsigned long avail_heap_pages(int zone, int node)
+{
+ int i,j;
+ unsigned long free_pages = 0;
+
+ for (i=0; i<NR_ZONES; i++)
+ if ( (zone == -1) || (zone == i) )
+ for (j=0; j<num_online_nodes(); j++)
+ if ( (node == -1) || (node == j) )
+ free_pages += avail[i][j];
+
+ return free_pages;
+}
+
unsigned long avail_domheap_pages(void)
{
unsigned long avail_nrm, avail_dma;
-
- avail_nrm = avail[MEMZONE_DOM];
-
- avail_dma = avail[MEMZONE_DMADOM];
+
+ avail_nrm = avail_heap_pages(MEMZONE_DOM,-1);
+
+ avail_dma = avail_heap_pages(MEMZONE_DMADOM,-1);
if ( avail_dma > lowmem_emergency_pool_pages )
avail_dma -= lowmem_emergency_pool_pages;
else
@@ -705,6 +773,10 @@
return avail_nrm + avail_dma;
}
+unsigned long avail_nodeheap_pages(int node)
+{
+ return avail_heap_pages(-1, node);
+}
static void pagealloc_keyhandler(unsigned char key)
{
@@ -712,9 +784,9 @@
printk(" Xen heap: %lukB free\n"
" DMA heap: %lukB free\n"
" Dom heap: %lukB free\n",
- avail[MEMZONE_XEN]<<(PAGE_SHIFT-10),
- avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10),
- avail[MEMZONE_DOM]<<(PAGE_SHIFT-10));
+ avail_heap_pages(MEMZONE_XEN, -1) << (PAGE_SHIFT-10),
+ avail_heap_pages(MEMZONE_DMADOM, -1) <<(PAGE_SHIFT-10),
+ avail_heap_pages(MEMZONE_DOM, -1) <<(PAGE_SHIFT-10));
}
@@ -776,6 +848,46 @@
} while ( (NOW() - start) < MILLISECS(1) );
}
+static unsigned long count_bucket(struct list_head* l, int order)
+{
+ unsigned long total_pages = 0;
+ int pages = 1 << order;
+ struct page_info *pg;
+
+ list_for_each_entry(pg, l, list)
+ total_pages += pages;
+
+ return total_pages;
+}
+
+static void dump_heap(unsigned char key)
+{
+ s_time_t now = NOW();
+ int i,j,k;
+ unsigned long total;
+
+ printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
+ (u32)(now>>32), (u32)now);
+
+ for (i=0; i<NR_ZONES; i++ )
+ for (j=0;j<MAX_NUMNODES;j++)
+ for (k=0;k<=MAX_ORDER;k++)
+ if ( !list_empty(&heap[i][j][k]) )
+ {
+ total = count_bucket(&heap[i][j][k], k);
+ printk("heap[%d][%d][%d]-> %lu pages\n",
+ i, j, k, total);
+ }
+}
+
+static __init int register_heap_trigger(void)
+{
+ register_keyhandler('H', dump_heap, "dump heap info");
+ return 0;
+}
+__initcall(register_heap_trigger);
+
+
static __init int page_scrub_init(void)
{
open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq);
diff -r 99e60f5df1d8 xen/include/xen/mm.h
--- a/xen/include/xen/mm.h Wed May 31 15:36:41 2006
+++ b/xen/include/xen/mm.h Wed May 31 14:10:01 2006
@@ -45,7 +45,8 @@
/* Generic allocator. These functions are *not* interrupt-safe. */
void init_heap_pages(
unsigned int zone, struct page_info *pg, unsigned long nr_pages);
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order);
+struct page_info *alloc_heap_pages(
+ unsigned int zone, unsigned int cpu, unsigned int order);
void free_heap_pages(
unsigned int zone, struct page_info *pg, unsigned int order);
void scrub_heap_pages(void);
@@ -61,8 +62,11 @@
void init_domheap_pages(paddr_t ps, paddr_t pe);
struct page_info *alloc_domheap_pages(
struct domain *d, unsigned int order, unsigned int flags);
+struct page_info *__alloc_domheap_pages(
+ struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags);
void free_domheap_pages(struct page_info *pg, unsigned int order);
unsigned long avail_domheap_pages(void);
+unsigned long avail_heap_pages(int zone, int node);
#define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0))
#define free_domheap_page(p) (free_domheap_pages(p,0))
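Existing callers are unchanged: alloc_domheap_pages() still allocates
relative to the current CPU. A hypothetical caller that wants memory near a
particular vcpu's CPU (the snippet below is illustrative only, not part of
this patch) would use the new entry point directly:

    /* Current behaviour: allocate relative to the CPU we are running on. */
    pg = alloc_domheap_pages(d, order, flags);

    /* New option: steer the allocation toward a chosen CPU's node. */
    pg = __alloc_domheap_pages(d, v->processor, order, flags);
    if ( pg == NULL )
        return -ENOMEM;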
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel