* Keir Fraser <Keir.Fraser@xxxxxxxxxxxx> [2006-05-13 04:33]:
>
> On 12 May 2006, at 16:12, Ryan Harper wrote:
>
> >>Then in
> >>init_domheap_pages(), there are no calls to memguard, just
> >>work to set up the range for a call to init_heap_pages(). I'm
> >>not sure if I need to use memguard for marking the chunk
> >>boundaries, or if just reserving chunk boundaries that weren't
> >>already on a MAX_ORDER edge via map_alloc() is sufficient.
>
> Just reserving is sufficient. memguard is a debug-build aid for xen
> heap only.
>
> >>Also, I didn't see a way to ensure reserved pages aren't freed via a
> >>call to init_heap_pages() which just clears out a range of bits in
> >>the alloc map. Should we be worried about that?
>
> Well, in theory. It's only called in one place though right now, and
> probably with a physical range below 128MB in all cases. So it's
> unlikely to straddle a NUMA boundary.
OK.
>
> Another comment on this patch (2/2): page_to_node() should be defined
> in numa.c (for now) rather than page_alloc.c (where it will never
> belong). I know you also directly scan the chunk array to find
> boundaries to reserve: perhaps for now we could have
> end_boot_allocator() call init_heap_pages() for each page, then
> init_heap_pages() can compare page_to_node(page) with
> page_to_node(page-1). If they differ and the latter is not -1 and page
> is not on a MAX_ORDER boundary, then you do not free the page to the
> buddy allocator.
>
> Clearly this will be crappily slow, but it's only used at boot time and
> as long as it's not *too* bad (which it certainly won't be for short
> chunk lists) then it'll do until we improve things (probably with a
> fast constant-time page_to_node() implementation).
I've re-worked this patch to use the new ACPI/x86_64 NUMA infrastructure
I just resent. Now page_alloc.c only uses MAX_NUMNODES and the
phys_to_nid() function. I've modified end_boot_allocator/init_heap_pages
as described above.
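In short, the per-page decision in init_heap_pages() now comes down to
something like the sketch below (simplified from the patch that follows;
the boundary test is expressed here as an mfn alignment check, and the
surrounding loop and locals are elided):

    nid_curr = phys_to_nid(page_to_maddr(pg + i));

    /* Free the page unless it starts a new node away from a MAX_ORDER edge. */
    if ( (nid_curr == nid_prev) ||
         ((page_to_mfn(pg + i) & ((1UL << MAX_ORDER) - 1)) == 0) )
        free_heap_pages(zone, pg + i, 0);
    /* else: leave the page reserved so chunks never merge across nodes. */

    nid_prev = nid_curr;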
--
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253 T/L: 678-9253
ryanh@xxxxxxxxxx
diffstat output:
common/page_alloc.c | 198 ++++++++++++++++++++++++++++++++++++++++------------
include/xen/mm.h | 6 +
2 files changed, 160 insertions(+), 44 deletions(-)
Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
---
diff -r 99e60f5df1d8 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c Wed May 31 15:36:41 2006
+++ b/xen/common/page_alloc.c Wed May 31 14:10:01 2006
@@ -4,6 +4,7 @@
* Simple buddy heap allocator for Xen.
*
* Copyright (c) 2002-2004 K A Fraser
+ * Copyright (c) 2006 IBM Ryan Harper <ryanh@xxxxxxxxxx>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -33,6 +34,7 @@
#include <xen/shadow.h>
#include <xen/domain_page.h>
#include <xen/keyhandler.h>
+#include <asm/numa.h>
#include <asm/page.h>
/*
@@ -246,22 +248,23 @@
#define pfn_dom_zone_type(_pfn) \
(((_pfn) <= MAX_DMADOM_PFN) ? MEMZONE_DMADOM : MEMZONE_DOM)
-static struct list_head heap[NR_ZONES][MAX_ORDER+1];
-
-static unsigned long avail[NR_ZONES];
+static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1];
+
+static unsigned long avail[NR_ZONES][MAX_NUMNODES];
static spinlock_t heap_lock = SPIN_LOCK_UNLOCKED;
void end_boot_allocator(void)
{
- unsigned long i, j;
+ unsigned long i, j, k;
int curr_free = 0, next_free = 0;
memset(avail, 0, sizeof(avail));
for ( i = 0; i < NR_ZONES; i++ )
- for ( j = 0; j <= MAX_ORDER; j++ )
- INIT_LIST_HEAD(&heap[i][j]);
+ for ( j = 0; j < MAX_NUMNODES; j++ )
+ for ( k = 0; k <= MAX_ORDER; k++ )
+ INIT_LIST_HEAD(&heap[i][j][k]);
/* Pages that are free now go to the domain sub-allocator. */
for ( i = 0; i < max_page; i++ )
@@ -271,29 +274,58 @@
if ( next_free )
map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */
if ( curr_free )
- free_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 0);
- }
-}
-
-/* Hand the specified arbitrary page range to the specified heap zone. */
+ init_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 1);
+ }
+}
+
+/*
+ * Hand the specified arbitrary page range to the specified heap zone,
+ * comparing each page's node with that of the previous page. If the
+ * nodes differ and the current page is not on a MAX_ORDER boundary,
+ * we reserve the page by not freeing it to the buddy allocator.
+ */
+#define MAX_ORDER_ALIGNED (1UL << (MAX_ORDER))
void init_heap_pages(
unsigned int zone, struct page_info *pg, unsigned long nr_pages)
{
+ unsigned int nid_curr,nid_prev;
unsigned long i;
ASSERT(zone < NR_ZONES);
+ if ( likely(page_to_mfn(pg) != 0) )
+ nid_prev = phys_to_nid(page_to_maddr(pg-1));
+ else
+ nid_prev = phys_to_nid(page_to_maddr(pg));
+
for ( i = 0; i < nr_pages; i++ )
- free_heap_pages(zone, pg+i, 0);
-}
-
+ {
+ nid_curr = phys_to_nid(page_to_maddr(pg+i));
+
+ /*
+ * free pages of the same node, or if they differ, but are on a
+ * MAX_ORDER alignment boundary (which already get reserved)
+ */
+ if ( (nid_curr == nid_prev) || (page_to_maddr(pg+i) &
+ MAX_ORDER_ALIGNED) )
+ free_heap_pages(zone, pg+i, 0);
+ else
+ printk("Reserving non-aligned node boundary @ mfn %lu\n",
+ page_to_mfn(pg+i));
+
+ nid_prev = nid_curr;
+ }
+}
/* Allocate 2^@order contiguous pages. */
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order)
-{
- int i;
+struct page_info *alloc_heap_pages(unsigned int zone, unsigned int cpu,
+ unsigned int order)
+{
+ int i,j, node;
struct page_info *pg;
+ ASSERT(cpu_to_node[cpu] >= 0);
+ ASSERT(cpu_to_node[cpu] < num_online_nodes());
ASSERT(zone < NR_ZONES);
if ( unlikely(order > MAX_ORDER) )
@@ -301,29 +333,38 @@
spin_lock(&heap_lock);
- /* Find smallest order which can satisfy the request. */
- for ( i = order; i <= MAX_ORDER; i++ )
- if ( !list_empty(&heap[zone][i]) )
- goto found;
+ /* start with requested node, but exhaust all node memory
+ * in requested zone before failing */
+ for ( i = 0; i < num_online_nodes(); i++ )
+ {
+ node = (cpu_to_node[cpu]+i) % num_online_nodes();
+ /* Find smallest order which can satisfy the request. */
+ for ( j = order; j <= MAX_ORDER; j++ )
+ {
+ if ( !list_empty(&heap[zone][node][j]) )
+ goto found;
+ }
+ }
/* No suitable memory blocks. Fail the request. */
spin_unlock(&heap_lock);
return NULL;
found:
- pg = list_entry(heap[zone][i].next, struct page_info, list);
+ pg = list_entry(heap[zone][node][j].next, struct page_info, list);
list_del(&pg->list);
/* We may have to halve the chunk a number of times. */
- while ( i != order )
- {
- PFN_ORDER(pg) = --i;
- list_add_tail(&pg->list, &heap[zone][i]);
- pg += 1 << i;
+ while ( j != order )
+ {
+ PFN_ORDER(pg) = --j;
+ list_add_tail(&pg->list, &heap[zone][node][j]);
+ pg += 1 << j;
}
map_alloc(page_to_mfn(pg), 1 << order);
- avail[zone] -= 1 << order;
+ ASSERT(avail[zone][node] >= (1 << order));
+ avail[zone][node] -= 1 << order;
spin_unlock(&heap_lock);
@@ -336,14 +377,17 @@
unsigned int zone, struct page_info *pg, unsigned int order)
{
unsigned long mask;
+ int node = phys_to_nid(page_to_maddr(pg));
ASSERT(zone < NR_ZONES);
ASSERT(order <= MAX_ORDER);
+ ASSERT(node >= 0);
+ ASSERT(node < num_online_nodes());
spin_lock(&heap_lock);
map_free(page_to_mfn(pg), 1 << order);
- avail[zone] += 1 << order;
+ avail[zone][node] += 1 << order;
/* Merge chunks as far as possible. */
while ( order < MAX_ORDER )
@@ -369,10 +413,13 @@
}
order++;
+
+ /* after merging, pg should be in the same node */
+ ASSERT(phys_to_nid(page_to_maddr(pg)) == node );
}
PFN_ORDER(pg) = order;
- list_add_tail(&pg->list, &heap[zone][order]);
+ list_add_tail(&pg->list, &heap[zone][node][order]);
spin_unlock(&heap_lock);
}
@@ -467,7 +514,7 @@
int i;
local_irq_save(flags);
- pg = alloc_heap_pages(MEMZONE_XEN, order);
+ pg = alloc_heap_pages(MEMZONE_XEN, smp_processor_id(), order);
local_irq_restore(flags);
if ( unlikely(pg == NULL) )
@@ -531,8 +578,8 @@
}
-struct page_info *alloc_domheap_pages(
- struct domain *d, unsigned int order, unsigned int flags)
+struct page_info *__alloc_domheap_pages(
+ struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags)
{
struct page_info *pg = NULL;
cpumask_t mask;
@@ -542,17 +589,17 @@
if ( !(flags & ALLOC_DOM_DMA) )
{
- pg = alloc_heap_pages(MEMZONE_DOM, order);
+ pg = alloc_heap_pages(MEMZONE_DOM, cpu, order);
/* Failure? Then check if we can fall back to the DMA pool. */
if ( unlikely(pg == NULL) &&
((order > MAX_ORDER) ||
- (avail[MEMZONE_DMADOM] <
+ (avail_heap_pages(MEMZONE_DMADOM,-1) <
(lowmem_emergency_pool_pages + (1UL << order)))) )
return NULL;
}
if ( pg == NULL )
- if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL )
+ if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, cpu, order)) == NULL )
return NULL;
mask = pg->u.free.cpumask;
@@ -615,6 +662,13 @@
spin_unlock(&d->page_alloc_lock);
return pg;
+}
+
+inline struct page_info *alloc_domheap_pages(
+ struct domain *d, unsigned int order, unsigned int flags)
+{
+ return __alloc_domheap_pages(d, smp_processor_id(), order, flags);
+
}
@@ -690,13 +744,27 @@
}
+unsigned long avail_heap_pages(int zone, int node)
+{
+ int i,j;
+ unsigned long free_pages = 0;
+
+ for (i=0; i<NR_ZONES; i++)
+ if ( (zone == -1) || (zone == i) )
+ for (j=0; j<num_online_nodes(); j++)
+ if ( (node == -1) || (node == j) )
+ free_pages += avail[i][j];
+
+ return free_pages;
+}
+
unsigned long avail_domheap_pages(void)
{
unsigned long avail_nrm, avail_dma;
-
- avail_nrm = avail[MEMZONE_DOM];
-
- avail_dma = avail[MEMZONE_DMADOM];
+
+ avail_nrm = avail_heap_pages(MEMZONE_DOM,-1);
+
+ avail_dma = avail_heap_pages(MEMZONE_DMADOM,-1);
if ( avail_dma > lowmem_emergency_pool_pages )
avail_dma -= lowmem_emergency_pool_pages;
else
@@ -705,6 +773,10 @@
return avail_nrm + avail_dma;
}
+unsigned long avail_nodeheap_pages(int node)
+{
+ return avail_heap_pages(-1, node);
+}
static void pagealloc_keyhandler(unsigned char key)
{
@@ -712,9 +784,9 @@
printk(" Xen heap: %lukB free\n"
" DMA heap: %lukB free\n"
" Dom heap: %lukB free\n",
- avail[MEMZONE_XEN]<<(PAGE_SHIFT-10),
- avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10),
- avail[MEMZONE_DOM]<<(PAGE_SHIFT-10));
+ avail_heap_pages(MEMZONE_XEN, -1) << (PAGE_SHIFT-10),
+ avail_heap_pages(MEMZONE_DMADOM, -1) <<(PAGE_SHIFT-10),
+ avail_heap_pages(MEMZONE_DOM, -1) <<(PAGE_SHIFT-10));
}
@@ -776,6 +848,46 @@
} while ( (NOW() - start) < MILLISECS(1) );
}
+static unsigned long count_bucket(struct list_head* l, int order)
+{
+ unsigned long total_pages = 0;
+ int pages = 1 << order;
+ struct page_info *pg;
+
+ list_for_each_entry(pg, l, list)
+ total_pages += pages;
+
+ return total_pages;
+}
+
+static void dump_heap(unsigned char key)
+{
+ s_time_t now = NOW();
+ int i,j,k;
+ unsigned long total;
+
+ printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
+ (u32)(now>>32), (u32)now);
+
+ for (i=0; i<NR_ZONES; i++ )
+ for (j=0;j<MAX_NUMNODES;j++)
+ for (k=0;k<=MAX_ORDER;k++)
+ if ( !list_empty(&heap[i][j][k]) )
+ {
+ total = count_bucket(&heap[i][j][k], k);
+ printk("heap[%d][%d][%d]-> %lu pages\n",
+ i, j, k, total);
+ }
+}
+
+static __init int register_heap_trigger(void)
+{
+ register_keyhandler('H', dump_heap, "dump heap info");
+ return 0;
+}
+__initcall(register_heap_trigger);
+
+
static __init int page_scrub_init(void)
{
open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq);
diff -r 99e60f5df1d8 xen/include/xen/mm.h
--- a/xen/include/xen/mm.h Wed May 31 15:36:41 2006
+++ b/xen/include/xen/mm.h Wed May 31 14:10:01 2006
@@ -45,7 +45,8 @@
/* Generic allocator. These functions are *not* interrupt-safe. */
void init_heap_pages(
unsigned int zone, struct page_info *pg, unsigned long nr_pages);
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order);
+struct page_info *alloc_heap_pages(
+ unsigned int zone, unsigned int cpu, unsigned int order);
void free_heap_pages(
unsigned int zone, struct page_info *pg, unsigned int order);
void scrub_heap_pages(void);
@@ -61,8 +62,11 @@
void init_domheap_pages(paddr_t ps, paddr_t pe);
struct page_info *alloc_domheap_pages(
struct domain *d, unsigned int order, unsigned int flags);
+struct page_info *__alloc_domheap_pages(
+ struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags);
void free_domheap_pages(struct page_info *pg, unsigned int order);
unsigned long avail_domheap_pages(void);
+unsigned long avail_heap_pages(int zone, int node);
#define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0))
#define free_domheap_page(p) (free_domheap_pages(p,0))
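Existing callers are unchanged: alloc_domheap_pages() still allocates
relative to the current CPU. A hypothetical caller that wants memory near a
particular vcpu's CPU (the snippet below is illustrative only, not part of
this patch) would use the new entry point directly:

    /* Current behaviour: allocate relative to the CPU we are running on. */
    pg = alloc_domheap_pages(d, order, flags);

    /* New option: steer the allocation toward a chosen CPU's node. */
    pg = __alloc_domheap_pages(d, v->processor, order, flags);
    if ( pg == NULL )
        return -ENOMEM;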
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel