numa: select nodes by cpu affinity

Along the same theme as changeset 21719, but expanded to all nodes from
which the domain is using processors.  Rather than being as strict as
exact_node_request, just fall back to the current behavior on failure.
This should help performance when CPU affinity is set by NUMA-aware
tools.

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -292,17 +292,47 @@
     return needed;
 }
 
+static void get_nodemask_by_cpu_affinity(
+    const struct domain *d, nodemask_t *nodemask)
+{
+    cpumask_t cpumask = CPU_MASK_NONE;
+    struct vcpu *v;
+    unsigned int node;
+
+    nodes_clear(*nodemask);
+
+    if ( d == NULL || num_online_nodes() == 1 )
+        goto all_online_nodes;
+
+    for_each_vcpu(d, v)
+        cpus_or(cpumask, cpumask, v->cpu_affinity);
+
+    if ( cpus_subset(cpu_online_map, cpumask) )
+        goto all_online_nodes;
+
+    for_each_online_node(node)
+        if ( cpus_intersects(node_to_cpumask(node), cpumask) )
+            node_set(node, *nodemask);
+    return;
+
+all_online_nodes:
+    nodes_or(*nodemask, *nodemask, node_online_map);
+    return;
+}
+
 /* Allocate 2^@order contiguous pages. */
 static struct page_info *alloc_heap_pages(
     unsigned int zone_lo, unsigned int zone_hi,
-    unsigned int node, unsigned int order, unsigned int memflags)
+    unsigned int node, unsigned int order, unsigned int memflags,
+    nodemask_t nodemask)
 {
     unsigned int i, j, zone = 0;
-    unsigned int num_nodes = num_online_nodes();
+    unsigned int num_nodes;
     unsigned long request = 1UL << order;
     bool_t exact_node_request = !!(memflags & MEMF_exact_node);
     cpumask_t extra_cpus_mask, mask;
     struct page_info *pg;
+    int nodemask_retry = 1;
 
     if ( node == NUMA_NO_NODE )
     {
@@ -335,6 +365,15 @@
      * zone before failing, only calc new node value if we fail to find memory
      * in target node, this avoids needless computation on fast-path.
      */
+    if ( exact_node_request )
+        num_nodes = 1;
+    else
+    {
+        nodes_and(nodemask, nodemask, node_online_map);
+        num_nodes = nodes_weight(nodemask);
+    }
+
+try_nodemask:
     for ( i = 0; i < num_nodes; i++ )
     {
         zone = zone_hi;
@@ -353,9 +392,17 @@
             goto not_found;
 
         /* Pick next node, wrapping around if needed. */
-        node = next_node(node, node_online_map);
+        node = next_node(node, nodemask);
         if (node == MAX_NUMNODES)
-            node = first_node(node_online_map);
+            node = first_node(nodemask);
+    }
+
+    if ( nodemask_retry-- && !nodes_equal(nodemask, node_online_map) )
+    {
+        nodes_andnot(nodemask, node_online_map, nodemask);
+        num_nodes = nodes_weight(nodemask);
+        node = first_node(nodemask);
+        goto try_nodemask;
     }
 
 try_tmem:
@@ -1010,7 +1057,7 @@
     ASSERT(!in_irq());
 
     pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
-                          cpu_to_node(smp_processor_id()), order, memflags);
+                          cpu_to_node(smp_processor_id()), order, memflags, node_online_map);
     if ( unlikely(pg == NULL) )
         return NULL;
 
@@ -1154,6 +1201,7 @@
     struct page_info *pg = NULL;
    unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
    unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1), dma_zone;
+    nodemask_t nodemask;
 
     ASSERT(!in_irq());
 
@@ -1164,13 +1212,16 @@
     if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
         return NULL;
 
+    get_nodemask_by_cpu_affinity(d, &nodemask);
+
     if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
-        pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags);
+        pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags,
+                              nodemask);
 
     if ( (pg == NULL) &&
          ((memflags & MEMF_no_dma) ||
           ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
-                                  node, order, memflags)) == NULL)) )
+                                  node, order, memflags, nodemask)) == NULL)) )
         return NULL;
 
     if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
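
For illustration only, a minimal standalone sketch of the node-selection
logic above.  This is not part of the patch: plain 64-bit masks stand in
for Xen's cpumask_t/nodemask_t helpers, and the two-node, eight-CPU
topology is invented for the example.

/*
 * Userspace model of get_nodemask_by_cpu_affinity(): OR together the
 * vCPU affinities, then mark every online node whose CPUs intersect the
 * result.  If the affinity covers all online CPUs (or there is no
 * domain), fall back to all online nodes.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_NODES 2

/* Invented topology: node 0 owns CPUs 0-3, node 1 owns CPUs 4-7. */
static const uint64_t node_to_cpumask[NR_NODES] = { 0x0fULL, 0xf0ULL };
static const uint64_t cpu_online_map  = 0xffULL;
static const uint64_t node_online_map = 0x03ULL;

static uint64_t nodemask_by_cpu_affinity(const uint64_t *vcpu_affinity,
                                         unsigned int nr_vcpus)
{
    uint64_t cpumask = 0, nodemask = 0;
    unsigned int node, i;

    /* No domain: behave like the d == NULL case and use all online nodes. */
    if ( nr_vcpus == 0 )
        return node_online_map;

    for ( i = 0; i < nr_vcpus; i++ )
        cpumask |= vcpu_affinity[i];

    /* cpus_subset(cpu_online_map, cpumask): affinity covers every CPU. */
    if ( (cpu_online_map & ~cpumask) == 0 )
        return node_online_map;

    for ( node = 0; node < NR_NODES; node++ )
        if ( (node_online_map & (1ULL << node)) &&
             (node_to_cpumask[node] & cpumask) )
            nodemask |= 1ULL << node;

    return nodemask;
}

int main(void)
{
    /* Two vCPUs pinned to CPUs 1 and 2: only node 0 should be selected. */
    uint64_t pinned[] = { 1ULL << 1, 1ULL << 2 };
    /* A vCPU allowed on every CPU: expect the full node_online_map. */
    uint64_t unpinned[] = { 0xffULL };

    printf("pinned   -> nodemask %#llx\n",
           (unsigned long long)nodemask_by_cpu_affinity(pinned, 2));
    printf("unpinned -> nodemask %#llx\n",
           (unsigned long long)nodemask_by_cpu_affinity(unpinned, 1));
    return 0;
}

Compiled and run, the pinned case selects only node 0 while the unpinned
case falls back to the full node_online_map, mirroring the fallback path
in get_nodemask_by_cpu_affinity().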