xen-devel

[Xen-devel] [Patch 1/6] Cpupools: hypervisor part

To: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [Patch 1/6] Cpupools: hypervisor part
From: Juergen Gross <juergen.gross@xxxxxxxxxxxxxx>
Date: Tue, 20 Apr 2010 11:38:43 +0200
Organization: Fujitsu Technology Solutions
-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@xxxxxxxxxxxxxx
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html
Signed-off-by: juergen.gross@xxxxxxxxxxxxxx
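
This patch adds the hypervisor side of cpupools: a new xen/common/cpupool.c,
a poolid parameter for domain_create(), replacement of cpu_add_remove_lock by
cpupool_lock, a per-pool instance of the credit scheduler's private data, and
a new XEN_DOMCTL_cpupool_op domctl for pool management.

For orientation, the boot-time flow introduced by the setup.c and
domain_build.c hunks below reduces to roughly this sequence (a condensed
sketch, not additional code in the patch):

    /* __start_xen(): create pool 0 with the default scheduler, assign it
     * all online cpus (or pool0_max_cpus), then create dom0 in pool 0. */
    cpupool0 = cpupool_create(0, NULL);
    if ( (cpupool0 == NULL) || cpupool0_cpu_assign(cpupool0) )
        panic("Error creating cpupool 0\n");
    dom0 = domain_create(0, 0, DOMCRF_s3_integrity, DOM0_SSIDREF);

    /* alloc_dom0_vcpu0()/construct_dom0(): dom0's vcpus are now placed on
     * cpus from cpupool0->cpu_valid instead of cpu_online_map. */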

diff -r fadf63ab49e7 xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/arch/x86/domain_build.c       Tue Apr 20 11:10:40 2010 +0200
@@ -9,6 +9,7 @@
 #include <xen/lib.h>
 #include <xen/ctype.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/smp.h>
 #include <xen/delay.h>
 #include <xen/event.h>
@@ -84,7 +85,7 @@ struct vcpu *__init alloc_dom0_vcpu0(voi
 struct vcpu *__init alloc_dom0_vcpu0(void)
 {
     if ( opt_dom0_max_vcpus == 0 )
-        opt_dom0_max_vcpus = num_online_cpus();
+        opt_dom0_max_vcpus = num_cpupool_cpus(cpupool0);
     if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
         opt_dom0_max_vcpus = MAX_VIRT_CPUS;
 
@@ -277,7 +278,7 @@ int __init construct_dom0(
     unsigned long _initrd_start, unsigned long initrd_len,
     char *cmdline)
 {
-    int i, rc, compatible, compat32, order, machine;
+    int i, cpu, rc, compatible, compat32, order, machine;
     struct cpu_user_regs *regs;
     unsigned long pfn, mfn;
     unsigned long nr_pages;
@@ -776,8 +777,12 @@ int __init construct_dom0(
 
     printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
 
+    cpu = first_cpu(cpupool0->cpu_valid);
     for ( i = 1; i < opt_dom0_max_vcpus; i++ )
-        (void)alloc_vcpu(d, i, i % num_online_cpus());
+    {
+        cpu = cycle_cpu(cpu, cpupool0->cpu_valid);
+        (void)alloc_vcpu(d, i, cpu);
+    }
 
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(d) )
diff -r fadf63ab49e7 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/arch/x86/mm.c Tue Apr 20 11:10:40 2010 +0200
@@ -242,7 +242,7 @@ void __init arch_init_memory(void)
      * Any Xen-heap pages that we will allow to be mapped will have
      * their domain field set to dom_xen.
      */
-    dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
+    dom_xen = domain_create(DOMID_XEN, CPUPOOLID_NONE, DOMCRF_dummy, 0);
     BUG_ON(dom_xen == NULL);
 
     /*
@@ -250,14 +250,14 @@ void __init arch_init_memory(void)
      * This domain owns I/O pages that are within the range of the page_info
      * array. Mappings occur at the priv of the caller.
      */
-    dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
+    dom_io = domain_create(DOMID_IO, CPUPOOLID_NONE, DOMCRF_dummy, 0);
     BUG_ON(dom_io == NULL);
     
     /*
      * Initialise our DOMID_IO domain.
      * This domain owns sharable pages.
      */
-    dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0);
+    dom_cow = domain_create(DOMID_COW, CPUPOOLID_NONE, DOMCRF_dummy, 0);
     BUG_ON(dom_cow == NULL);
 
     /* First 1MB of RAM is historically marked as I/O. */
diff -r fadf63ab49e7 xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/arch/x86/platform_hypercall.c Tue Apr 20 11:10:40 2010 +0200
@@ -19,6 +19,7 @@
 #include <xen/iocap.h>
 #include <xen/guest_access.h>
 #include <xen/acpi.h>
+#include <xen/sched-if.h>
 #include <asm/current.h>
 #include <public/platform.h>
 #include <acpi/cpufreq/processor_perf.h>
@@ -407,7 +408,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
         g_info = &op->u.pcpu_info;
 
         /* spin_trylock() avoids deadlock with stop_machine_run(). */
-        if ( !spin_trylock(&cpu_add_remove_lock) )
+        if ( !spin_trylock(&cpupool_lock) )
         {
             ret = -EBUSY;
             break;
@@ -430,7 +431,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
 
         g_info->max_present = last_cpu(cpu_present_map);
 
-        spin_unlock(&cpu_add_remove_lock);
+        spin_unlock(&cpupool_lock);
 
         ret = copy_to_guest(u_xenpf_op, op, 1) ? -EFAULT : 0;
     }
diff -r fadf63ab49e7 xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/arch/x86/setup.c      Tue Apr 20 11:10:40 2010 +0200
@@ -2,6 +2,7 @@
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/domain.h>
 #include <xen/serial.h>
 #include <xen/softirq.h>
@@ -245,7 +246,7 @@ static void __init init_idle_domain(void
     /* Domain creation requires that scheduler structures are initialised. */
     scheduler_init();
 
-    idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
+    idle_domain = domain_create(IDLE_DOMAIN_ID, CPUPOOLID_NONE, 0, 0);
     if ( idle_domain == NULL )
         BUG();
     idle_domain->vcpu = idle_vcpu;
@@ -1093,8 +1094,13 @@ void __init __start_xen(unsigned long mb
     if ( !tboot_protect_mem_regions() )
         panic("Could not protect TXT memory regions\n");
 
+    /* Create initial cpupool 0. */
+    cpupool0 = cpupool_create(0, NULL);
+    if ( (cpupool0 == NULL) || cpupool0_cpu_assign(cpupool0) )
+        panic("Error creating cpupool 0\n");
+
     /* Create initial domain 0. */
-    dom0 = domain_create(0, DOMCRF_s3_integrity, DOM0_SSIDREF);
+    dom0 = domain_create(0, 0, DOMCRF_s3_integrity, DOM0_SSIDREF);
     if ( (dom0 == NULL) || (alloc_dom0_vcpu0() == NULL) )
         panic("Error creating domain 0\n");
 
diff -r fadf63ab49e7 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/arch/x86/smpboot.c    Tue Apr 20 11:10:40 2010 +0200
@@ -39,6 +39,7 @@
 #include <xen/mm.h>
 #include <xen/domain.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/irq.h>
 #include <xen/delay.h>
 #include <xen/softirq.h>
@@ -105,7 +106,6 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
 DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
 void *stack_base[NR_CPUS];
-DEFINE_SPINLOCK(cpu_add_remove_lock);
 
 /*
  * The bootstrap kernel entry code has set these up. Save them for
@@ -822,7 +822,7 @@ wakeup_secondary_cpu(int phys_apicid, un
 
 extern cpumask_t cpu_initialized;
 /*
- * Caller should hold cpu_add_remove_lock if not called when booting
+ * Caller should hold cpupool_lock if not called when booting
  */
 int alloc_cpu_id(void)
 {
@@ -986,8 +986,10 @@ static int __devinit do_boot_cpu(int api
                cpucount--;
 
                /* Mark the CPU as non-present */
+               spin_lock(&cpupool_lock);
                x86_cpu_to_apicid[cpu] = BAD_APICID;
                cpu_clear(cpu, cpu_present_map);
+               spin_unlock(&cpupool_lock);
        } else {
        }
 
@@ -1307,10 +1309,11 @@ int __cpu_disable(void)
        sync_local_execstate();
 
        /* It's now safe to remove this processor from the online map */
+       cpu_clear(cpu, cpupool0->cpu_valid);
        cpu_clear(cpu, cpu_online_map);
        fixup_irqs();
 
-       cpu_disable_scheduler();
+       cpu_disable_scheduler(cpu, 0);
 
        return 0;
 }
@@ -1344,10 +1347,9 @@ int cpu_down(unsigned int cpu)
        int err = 0;
 
        /* spin_trylock() avoids deadlock with stop_machine_run(). */
-       if (!spin_trylock(&cpu_add_remove_lock))
+       if (!spin_trylock(&cpupool_lock))
                return -EBUSY;
-
-       if (num_online_cpus() == 1) {
+       if ((!cpu_isset(cpu, cpupool0->cpu_valid)) || (cpus_weight(cpupool0->cpu_valid) == 1)) {
                err = -EBUSY;
                goto out;
        }
@@ -1381,7 +1383,7 @@ out:
 out:
        if (!err)
                send_guest_global_virq(dom0, VIRQ_PCPU_STATE);
-       spin_unlock(&cpu_add_remove_lock);
+       spin_unlock(&cpupool_lock);
        return err;
 }
 
@@ -1390,7 +1392,7 @@ int cpu_up(unsigned int cpu)
        int err = 0;
 
        /* spin_trylock() avoids deadlock with stop_machine_run(). */
-       if (!spin_trylock(&cpu_add_remove_lock))
+       if (!spin_trylock(&cpupool_lock))
            return -EBUSY;
 
        if (cpu_online(cpu)) {
@@ -1408,7 +1410,7 @@ out:
 out:
        if (!err)
                send_guest_global_virq(dom0, VIRQ_PCPU_STATE);
-       spin_unlock(&cpu_add_remove_lock);
+       spin_unlock(&cpupool_lock);
        return err;
 }
 
@@ -1494,14 +1496,14 @@ int cpu_add(uint32_t apic_id, uint32_t a
                return -EEXIST;
 
        /* spin_trylock() avoids deadlock with stop_machine_run(). */
-       if (!spin_trylock(&cpu_add_remove_lock))
+       if (!spin_trylock(&cpupool_lock))
                return -EBUSY;
 
        cpu = mp_register_lapic(apic_id, 1);
 
        if (cpu < 0)
        {
-               spin_unlock(&cpu_add_remove_lock);
+               spin_unlock(&cpupool_lock);
                return cpu;
        }
 
@@ -1518,7 +1520,7 @@ int cpu_add(uint32_t apic_id, uint32_t a
                                "Setup node failed for pxm %x\n", pxm);
                        x86_acpiid_to_apicid[acpi_id] = 0xff;
                        mp_unregister_lapic(apic_id, cpu);
-                       spin_unlock(&cpu_add_remove_lock);
+                       spin_unlock(&cpupool_lock);
                        return node;
                }
                apicid_to_node[apic_id] = node;
@@ -1526,7 +1528,7 @@ int cpu_add(uint32_t apic_id, uint32_t a
 
        srat_detect_node(cpu);
        numa_add_cpu(cpu);
-       spin_unlock(&cpu_add_remove_lock);
+       spin_unlock(&cpupool_lock);
        dprintk(XENLOG_INFO, "Add CPU %x with index %x\n", apic_id, cpu);
        return cpu;
 }
@@ -1570,6 +1572,7 @@ int __devinit __cpu_up(unsigned int cpu)
                process_pending_softirqs();
        }
 
+       cpupool_cpu_add(cpu);
        cpufreq_add_cpu(cpu);
        return 0;
 }
diff -r fadf63ab49e7 xen/common/Makefile
--- a/xen/common/Makefile       Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/common/Makefile       Tue Apr 20 11:10:40 2010 +0200
@@ -1,5 +1,6 @@ obj-y += bitmap.o
 obj-y += bitmap.o
 obj-y += cpu.o
+obj-y += cpupool.o
 obj-y += domctl.o
 obj-y += domain.o
 obj-y += event_channel.o
diff -r fadf63ab49e7 xen/common/cpupool.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/common/cpupool.c      Tue Apr 20 11:10:40 2010 +0200
@@ -0,0 +1,580 @@
+/******************************************************************************
+ * cpupool.c
+ * 
+ * Generic cpupool-handling functions.
+ *
+ * Cpupools are a feature providing configurable scheduling domains. Each
+ * cpupool runs its own scheduler on a dedicated set of physical cpus.
+ * A domain is bound to one cpupool at any time, but it can be moved to
+ * another cpupool.
+ *
+ * (C) 2009, Juergen Gross, Fujitsu Technology Solutions
+ */
+
+#include <xen/lib.h>
+#include <xen/init.h>
+#include <xen/cpumask.h>
+#include <xen/percpu.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+
+#define for_each_cpupool(ptr)    \
+    for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next))
+
+struct cpupool *cpupool0;                /* Initial cpupool with Dom0 */
+cpumask_t cpupool_free_cpus;             /* cpus not in any cpupool */
+
+static struct cpupool *cpupool_list;     /* linked list, sorted by poolid */
+
+static int cpupool0_max_cpus;
+integer_param("pool0_max_cpus", cpupool0_max_cpus);
+
+static int cpupool_moving_cpu = -1;
+static struct cpupool *cpupool_cpu_moving = NULL;
+
+/* cpupool lock: be careful, this lock is sometimes released on a different
+ *               cpu than the one on which it was obtained!
+ */
+DEFINE_SPINLOCK(cpupool_lock);
+
+DEFINE_PER_CPU(struct cpupool *, cpupool);
+
+static struct cpupool *alloc_cpupool_struct(void)
+{
+    return xmalloc(struct cpupool);
+}
+
+static void free_cpupool_struct(struct cpupool *c)
+{
+    xfree(c);
+}
+
+/*
+ * find a cpupool by its id. to be called with the cpupool lock held.
+ * if exact is not specified, the first cpupool with an id larger than or
+ * equal to the searched id is returned.
+ * returns NULL if not found.
+ */
+static struct cpupool *cpupool_find_by_id(int id, int exact)
+{
+    struct cpupool **q;
+
+    for_each_cpupool(q)
+    {
+        if ( (*q)->cpupool_id == id )
+            return *q;
+        if ( (*q)->cpupool_id > id )
+            break;
+    }
+    return exact ? NULL : *q;
+}
+
+/*
+ * create a new cpupool with specified poolid and scheduler
+ * returns pointer to the new cpupool structure on success, NULL otherwise
+ * possible failures:
+ * - no memory
+ * - poolid already used
+ * - unknown scheduler
+ */
+struct cpupool *cpupool_create(int poolid, char *sched)
+{
+    struct cpupool *c;
+    struct cpupool **q;
+    int last = 0;
+
+    if ( (c = alloc_cpupool_struct()) == NULL )
+        return NULL;
+    memset(c, 0, sizeof(*c));
+
+    printk(XENLOG_DEBUG "cpupool_create(pool=%d,sched=%s)\n", poolid, sched);
+    spin_lock(&cpupool_lock);
+    for_each_cpupool(q)
+    {
+        last = (*q)->cpupool_id;
+        if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) )
+            break;
+    }
+    if ( *q != NULL )
+    {
+        if ( (*q)->cpupool_id == poolid )
+        {
+            spin_unlock(&cpupool_lock);
+            free_cpupool_struct(c);
+            return NULL;
+        }
+        c->next = *q;
+    }
+    *q = c;
+    c->cpupool_id = (poolid == CPUPOOLID_NONE) ? (last + 1) : poolid;
+    if ( schedule_init_global(sched, &(c->sched)) )
+    {
+        spin_unlock(&cpupool_lock);
+        cpupool_destroy(c);
+        return NULL;
+    }
+    spin_unlock(&cpupool_lock);
+
+    printk("Created cpupool %d with scheduler %s (%s)\n", c->cpupool_id,
+        c->sched.name, c->sched.opt_name);
+
+    return c;
+}
+/*
+ * destroys the given cpupool
+ * returns 0 on success, 1 otherwise
+ * possible failures:
+ * - pool still in use
+ * - cpus still assigned to pool
+ * - pool not in list
+ */
+int cpupool_destroy(struct cpupool *c)
+{
+    struct cpupool **q;
+
+    spin_lock(&cpupool_lock);
+    for_each_cpupool(q)
+        if ( *q == c )
+            break;
+    if ( (*q != c) || (c->n_dom != 0) || cpus_weight(c->cpu_valid) )
+    {
+        spin_unlock(&cpupool_lock);
+        return 1;
+    }
+    *q = c->next;
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_destroy(pool=%d)\n", c->cpupool_id);
+    schedule_deinit_global(&(c->sched));
+    free_cpupool_struct(c);
+    return 0;
+}
+
+/*
+ * assign a specific cpu to a cpupool
+ * cpupool_lock must be held
+ */
+static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
+{
+    if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) )
+        return -EBUSY;
+    per_cpu(cpupool, cpu) = c;
+    schedule_cpu_switch(cpu, c);
+    cpu_clear(cpu, cpupool_free_cpus);
+    if (cpupool_moving_cpu == cpu)
+    {
+        cpupool_moving_cpu = -1;
+        cpupool_cpu_moving = NULL;
+    }
+    cpu_set(cpu, c->cpu_valid);
+    return 0;
+}
+
+/*
+ * assign free physical cpus to a cpupool
+ * cpus assigned are unused cpus with lowest possible ids
+ * returns the number of cpus assigned
+ */
+int cpupool_assign_ncpu(struct cpupool *c, int ncpu)
+{
+    int i;
+    int n;
+
+    n = 0;
+    spin_lock(&cpupool_lock);
+    for_each_cpu_mask(i, cpupool_free_cpus)
+    {
+        if ( cpupool_assign_cpu_locked(c, i) == 0 )
+            n++;
+        if ( n == ncpu )
+            break;
+    }
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_assign_ncpu(pool=%d,ncpu=%d) rc %d\n",
+        c->cpupool_id, ncpu, n);
+    return n;
+}
+
+static long cpupool_unassign_cpu_helper(void *info)
+{
+    struct cpupool *c = (struct cpupool *)info;
+    int cpu = cpupool_moving_cpu;
+    long ret;
+    int cpupool_id = c->cpupool_id;
+
+    ret = cpu_disable_scheduler(cpu, 1);
+    cpu_set(cpu, cpupool_free_cpus);
+    if ( !ret )
+    {
+        schedule_cpu_switch(cpu, NULL);
+        per_cpu(cpupool, cpu) = NULL;
+        cpupool_moving_cpu = -1;
+        cpupool_cpu_moving = NULL;
+    }
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d) ret %ld\n",
+        cpupool_id, cpu, ret);
+    return ret;
+}
+
+/*
+ * unassign a specific cpu from a cpupool
+ * we must be sure not to run on the cpu to be unassigned! to achieve this,
+ * the main work is performed via continue_hypercall_on_cpu() on a
+ * specifically chosen other cpu.
+ * if the cpu to be removed is the last one in the cpupool, no active domain
+ * may remain bound to the cpupool. dying domains are moved to cpupool0, as
+ * they might be zombies.
+ * possible failures:
+ * - last cpu and still active domains in cpupool
+ */
+int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu)
+{
+    int work_cpu;
+    int ret;
+    struct domain *d;
+    int cpupool_id = c->cpupool_id;
+
+    printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d)\n",
+        cpupool_id, cpu);
+    spin_lock(&cpupool_lock);
+    ret = -EBUSY;
+    if ( (cpupool_moving_cpu != -1) && (cpu != cpupool_moving_cpu) )
+        goto out;
+
+    ret = 0;
+    if ( !cpu_isset(cpu, c->cpu_valid) && (cpu != cpupool_moving_cpu) )
+        goto out;
+
+    if ( (c->n_dom > 0) && (cpus_weight(c->cpu_valid) == 1) &&
+         (cpu != cpupool_moving_cpu) )
+    {
+        for_each_domain(d)
+        {
+            if ( d->cpupool != c )
+                continue;
+            if ( !d->is_dying )
+            {
+                ret = -EBUSY;
+                break;
+            }
+            c->n_dom--;
+            ret = sched_move_domain(d, cpupool0);
+            if ( ret )
+            {
+                c->n_dom++;
+                break;
+            }
+            cpupool0->n_dom++;
+        }
+        if ( ret )
+            goto out;
+    }
+    cpupool_moving_cpu = cpu;
+    cpupool_cpu_moving = c;
+    cpu_clear(cpu, c->cpu_valid);
+    work_cpu = smp_processor_id();
+    if ( work_cpu == cpu )
+    {
+        work_cpu = first_cpu(cpupool0->cpu_valid);
+        if ( work_cpu == cpu )
+            work_cpu = next_cpu(cpu, cpupool0->cpu_valid);
+    }
+    return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c);
+
+out:
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n",
+        cpupool_id, cpu, ret);
+    return ret;
+}
+
+/*
+ * assign cpus to the default cpupool
+ * by default all cpus are used; fewer may be requested via boot parameter
+ * possible failures:
+ * - no cpu assigned
+ */
+int __init cpupool0_cpu_assign(struct cpupool *c)
+{
+    if ( (cpupool0_max_cpus == 0) || (cpupool0_max_cpus > num_online_cpus()) )
+        cpupool0_max_cpus = num_online_cpus();
+    if ( !cpupool_assign_ncpu(cpupool0, cpupool0_max_cpus) )
+        return 1;
+    return 0;
+}
+
+/*
+ * add a new domain to a cpupool
+ * possible failures:
+ * - pool does not exist
+ * - no cpu assigned to pool
+ */
+int cpupool_add_domain(struct domain *d, int poolid)
+{
+    struct cpupool *c;
+    int rc = 1;
+    int n_dom;
+
+    if ( poolid == CPUPOOLID_NONE )
+        return 0;
+    spin_lock(&cpupool_lock);
+    c = cpupool_find_by_id(poolid, 1);
+    if ( (c != NULL) && cpus_weight(c->cpu_valid) )
+    {
+        c->n_dom++;
+        n_dom = c->n_dom;
+        d->cpupool = c;
+        rc = 0;
+    }
+    spin_unlock(&cpupool_lock);
+    if (!rc)
+        printk(XENLOG_DEBUG "cpupool_add_domain(dom=%d,pool=%d) n_dom %d\n",
+            d->domain_id, poolid, n_dom);
+    return rc;
+}
+
+/*
+ * remove a domain from a cpupool
+ */
+void cpupool_rm_domain(struct domain *d)
+{
+    int cpupool_id;
+    int n_dom;
+
+    if ( d->cpupool == NULL )
+        return;
+    spin_lock(&cpupool_lock);
+    cpupool_id = d->cpupool->cpupool_id;
+    d->cpupool->n_dom--;
+    n_dom = d->cpupool->n_dom;
+    d->cpupool = NULL;
+    spin_unlock(&cpupool_lock);
+    printk(XENLOG_DEBUG "cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n",
+        d->domain_id, cpupool_id, n_dom);
+    return;
+}
+
+/*
+ * called to add a new cpu to the cpupool administration
+ * a hotplugged cpu is added to cpupool0 so that it can be used by dom0
+ */
+void cpupool_cpu_add(unsigned int cpu)
+{
+    if ( cpupool0 == NULL )
+        return;
+    spin_lock(&cpupool_lock);
+    cpu_set(cpu, cpupool_free_cpus);
+    cpupool_assign_cpu_locked(cpupool0, cpu);
+    spin_unlock(&cpupool_lock);
+    return;
+}
+
+/*
+ * do cpupool related domctl operations
+ */
+int cpupool_do_domctl(struct xen_domctl_cpupool_op *op)
+{
+    int ret;
+    struct cpupool *c;
+
+    switch ( op->op )
+    {
+
+    case XEN_DOMCTL_CPUPOOL_OP_CREATE:
+    {
+        int poolid;
+        const struct scheduler *sched;
+
+        poolid = (op->cpupool_id == XEN_DOMCTL_CPUPOOL_PAR_ANY) ?
+            CPUPOOLID_NONE: op->cpupool_id;
+        sched = scheduler_get_by_id(op->sched_id);
+        ret = -ENOENT;
+        if ( sched == NULL )
+            break;
+        ret = 0;
+        c = cpupool_create(poolid, sched->opt_name);
+        if ( c == NULL )
+            ret = -EINVAL;
+        else
+            op->cpupool_id = c->cpupool_id;
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_DESTROY:
+    {
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->cpupool_id, 1);
+        spin_unlock(&cpupool_lock);
+        ret = -ENOENT;
+        if ( c == NULL )
+            break;
+        ret = (cpupool_destroy(c) != 0) ? -EBUSY : 0;
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_INFO:
+    {
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->cpupool_id, 0);
+        spin_unlock(&cpupool_lock);
+        ret = -ENOENT;
+        if ( c == NULL )
+            break;
+        op->cpupool_id = c->cpupool_id;
+        op->sched_id = c->sched.sched_id;
+        op->n_dom = c->n_dom;
+        cpumask_to_xenctl_cpumap(&(op->cpumap), &(c->cpu_valid));
+        ret = 0;
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_ADDCPU:
+    {
+        unsigned cpu;
+
+        cpu = op->cpu;
+        printk(XENLOG_DEBUG "cpupool_assign_cpu(pool=%d,cpu=%d)\n",
+            op->cpupool_id, cpu);
+        spin_lock(&cpupool_lock);
+        if ( cpu == XEN_DOMCTL_CPUPOOL_PAR_ANY )
+            cpu = first_cpu(cpupool_free_cpus);
+        ret = -EINVAL;
+        if ( cpu >= NR_CPUS )
+            goto addcpu_out;
+        ret = -EBUSY;
+        if ( !cpu_isset(cpu, cpupool_free_cpus) )
+            goto addcpu_out;
+        c = cpupool_find_by_id(op->cpupool_id, 0);
+        ret = -ENOENT;
+        if ( c == NULL )
+            goto addcpu_out;
+        ret = cpupool_assign_cpu_locked(c, cpu);
+addcpu_out:
+        spin_unlock(&cpupool_lock);
+        printk(XENLOG_DEBUG "cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n",
+            op->cpupool_id, cpu, ret);
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_RMCPU:
+    {
+        unsigned cpu;
+
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->cpupool_id, 0);
+        spin_unlock(&cpupool_lock);
+        ret = -ENOENT;
+        if ( c == NULL )
+            break;
+        cpu = op->cpu;
+        if ( cpu == XEN_DOMCTL_CPUPOOL_PAR_ANY )
+            cpu = last_cpu(c->cpu_valid);
+        ret = -EINVAL;
+        if ( cpu >= NR_CPUS )
+            break;
+        /* caution: cpupool_unassign_cpu uses continue_hypercall_on_cpu and
+         * will continue after the local return
+         */
+        ret = cpupool_unassign_cpu(c, cpu);
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_MOVEDOMAIN:
+    {
+        struct domain *d;
+
+        ret = -EINVAL;
+        if ( op->domid == 0 )
+            break;
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(op->domid);
+        if ( d == NULL )
+            break;
+        if ( d->cpupool == NULL )
+        {
+            ret = -EINVAL;
+            rcu_unlock_domain(d);
+            break;
+        }
+        printk(XENLOG_DEBUG "cpupool move_domain(dom=%d)->pool=%d\n",
+            d->domain_id, op->cpupool_id);
+        ret = -ENOENT;
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->cpupool_id, 1);
+        if ( (c != NULL) && cpus_weight(c->cpu_valid) )
+        {
+            d->cpupool->n_dom--;
+            ret = sched_move_domain(d, c);
+            if ( ret )
+                d->cpupool->n_dom++;
+            else
+                c->n_dom++;
+        }
+        spin_unlock(&cpupool_lock);
+        printk(XENLOG_DEBUG "cpupool move_domain(dom=%d)->pool=%d ret %d\n",
+            d->domain_id, op->cpupool_id, ret);
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_FREEINFO:
+    {
+        cpumask_to_xenctl_cpumap(&(op->cpumap),
+            &cpupool_free_cpus);
+        ret = 0;
+    }
+    break;
+
+    default:
+        ret = -ENOSYS;
+
+    }
+
+    return ret;
+}
+
+void schedule_dump(struct cpupool *c);
+
+void dump_runq(unsigned char key)
+{
+    unsigned long    flags;
+    s_time_t         now = NOW();
+    struct cpupool **c;
+
+    spin_lock(&cpupool_lock);
+    local_irq_save(flags);
+
+    printk("sched_smt_power_savings: %s\n",
+            sched_smt_power_savings? "enabled":"disabled");
+    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);
+
+    printk("Idle cpupool:\n");
+    schedule_dump(NULL);
+
+    for_each_cpupool(c)
+    {
+        printk("Cpupool %d:\n", (*c)->cpupool_id);
+        schedule_dump(*c);
+    }
+
+    local_irq_restore(flags);
+    spin_unlock(&cpupool_lock);
+}
+
+static int __init cpupool_init(void)
+{
+    cpupool_free_cpus = cpu_online_map;
+    cpupool_list = NULL;
+    return 0;
+}
+__initcall(cpupool_init);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
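
The trickiest part of the new file is cpupool_unassign_cpu(): the final steps
must not run on the cpu that is being removed, so they are bounced to another
cpu and cpupool_lock is released there. Condensed to a control-flow sketch
(taken from the function above, error paths omitted):

    spin_lock(&cpupool_lock);
    cpupool_moving_cpu = cpu;                /* mark the cpu as in transit */
    cpupool_cpu_moving = c;
    cpu_clear(cpu, c->cpu_valid);
    work_cpu = smp_processor_id();
    if ( work_cpu == cpu )                   /* never finish on 'cpu' itself */
        work_cpu = first_cpu(cpupool0->cpu_valid);
    /* lock stays held; cpupool_unassign_cpu_helper() drops it on work_cpu */
    return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c);
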
diff -r fadf63ab49e7 xen/common/domain.c
--- a/xen/common/domain.c       Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/common/domain.c       Tue Apr 20 11:10:40 2010 +0200
@@ -150,6 +150,8 @@ struct vcpu *alloc_vcpu(
 
     tasklet_init(&v->continue_hypercall_tasklet, NULL, 0);
 
+    tasklet_init(&v->continue_hypercall_tasklet, NULL, 0);
+
     if ( is_idle_domain(d) )
     {
         v->runstate.state = RUNSTATE_running;
@@ -212,7 +214,7 @@ custom_param("extra_guest_irqs", parse_e
 custom_param("extra_guest_irqs", parse_extra_guest_irqs);
 
 struct domain *domain_create(
-    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref)
+    domid_t domid, int poolid, unsigned int domcr_flags, ssidref_t ssidref)
 {
     struct domain *d, **pd;
     enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2,
@@ -291,6 +293,9 @@ struct domain *domain_create(
     d->iomem_caps = rangeset_new(d, "I/O Memory", RANGESETF_prettyprint_hex);
     d->irq_caps   = rangeset_new(d, "Interrupts", 0);
     if ( (d->iomem_caps == NULL) || (d->irq_caps == NULL) )
+        goto fail;
+
+    if ( cpupool_add_domain(d, poolid) != 0 )
         goto fail;
 
     if ( sched_init_domain(d) != 0 )
@@ -600,6 +605,8 @@ static void complete_domain_destroy(stru
     arch_domain_destroy(d);
 
     rangeset_domain_destroy(d);
+
+    cpupool_rm_domain(d);
 
     sched_destroy_domain(d);
 
diff -r fadf63ab49e7 xen/common/domctl.c
--- a/xen/common/domctl.c       Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/common/domctl.c       Tue Apr 20 11:10:40 2010 +0200
@@ -11,6 +11,7 @@
 #include <xen/lib.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/domain.h>
 #include <xen/event.h>
 #include <xen/domain_page.h>
@@ -140,10 +141,12 @@ void getdomaininfo(struct domain *d, str
     info->shared_info_frame = mfn_to_gmfn(d, __pa(d->shared_info)>>PAGE_SHIFT);
     BUG_ON(SHARED_M2P(info->shared_info_frame));
 
+    info->cpupool = d->cpupool ? d->cpupool->cpupool_id : CPUPOOLID_NONE;
+
     memcpy(info->handle, d->handle, sizeof(xen_domain_handle_t));
 }
 
-static unsigned int default_vcpu0_location(void)
+static unsigned int default_vcpu0_location(cpumask_t *online)
 {
     struct domain *d;
     struct vcpu   *v;
@@ -173,7 +176,7 @@ static unsigned int default_vcpu0_locati
     if ( cpus_weight(per_cpu(cpu_sibling_map, 0)) > 1 )
         cpu = next_cpu(cpu, per_cpu(cpu_sibling_map, 0));
     cpu_exclude_map = per_cpu(cpu_sibling_map, 0);
-    for_each_online_cpu ( i )
+    for_each_cpu_mask(i, *online)
     {
         if ( cpu_isset(i, cpu_exclude_map) )
             continue;
@@ -388,12 +391,14 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
         domid_t        dom;
         static domid_t rover = 0;
         unsigned int domcr_flags;
+        int            pool = 0;
 
         ret = -EINVAL;
         if ( supervisor_mode_kernel ||
              (op->u.createdomain.flags &
              ~(XEN_DOMCTL_CDF_hvm_guest | XEN_DOMCTL_CDF_hap |
-               XEN_DOMCTL_CDF_s3_integrity | XEN_DOMCTL_CDF_oos_off)) )
+               XEN_DOMCTL_CDF_s3_integrity | XEN_DOMCTL_CDF_oos_off |
+               XEN_DOMCTL_CDF_pool)) )
             break;
 
         dom = op->domain;
@@ -429,9 +434,15 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
             domcr_flags |= DOMCRF_s3_integrity;
         if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_oos_off )
             domcr_flags |= DOMCRF_oos_off;
+        if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_pool )
+            pool = op->u.createdomain.cpupool;
+
+        ret = -EINVAL;
+        if ( pool == CPUPOOLID_NONE )
+            break;
 
         ret = -ENOMEM;
-        d = domain_create(dom, domcr_flags, op->u.createdomain.ssidref);
+        d = domain_create(dom, pool, domcr_flags, op->u.createdomain.ssidref);
         if ( d == NULL )
             break;
 
@@ -450,6 +461,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
     {
         struct domain *d;
         unsigned int i, max = op->u.max_vcpus.max, cpu;
+        cpumask_t *online;
 
         ret = -ESRCH;
         if ( (d = rcu_lock_domain_by_id(op->domain)) == NULL )
@@ -498,6 +510,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
             goto maxvcpu_out;
 
         ret = -ENOMEM;
+        online = (d->cpupool == NULL) ? &cpu_online_map : &d->cpupool->cpu_valid;
         if ( max > d->max_vcpus )
         {
             struct vcpu **vcpus;
@@ -521,8 +534,8 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
                 continue;
 
             cpu = (i == 0) ?
-                default_vcpu0_location() :
-                cycle_cpu(d->vcpu[i-1]->processor, cpu_online_map);
+                default_vcpu0_location(online) :
+                cycle_cpu(d->vcpu[i-1]->processor, *online);
 
             if ( alloc_vcpu(d, i, cpu) == NULL )
                 goto maxvcpu_out;
@@ -961,6 +974,14 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
     }
     break;
 
+    case XEN_DOMCTL_cpupool_op:
+    {
+        ret = cpupool_do_domctl(&op->u.cpupool_op);
+        if ( (ret == 0) && copy_to_guest(u_domctl, op, 1) )
+            ret = -EFAULT;
+    }
+    break;
+
     default:
         ret = arch_do_domctl(op, u_domctl);
         break;
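
The new domctl is driven from dom0's toolstack. A hypothetical caller-side
sketch (the field names match what cpupool_do_domctl() consumes; the
xen_domctl_cpupool_op layout itself is defined in the public header changes,
which are beyond the excerpt shown here):

    struct xen_domctl ctl = { .cmd = XEN_DOMCTL_cpupool_op };

    ctl.u.cpupool_op.op         = XEN_DOMCTL_CPUPOOL_OP_CREATE;
    ctl.u.cpupool_op.cpupool_id = XEN_DOMCTL_CPUPOOL_PAR_ANY;  /* Xen picks the id */
    ctl.u.cpupool_op.sched_id   = XEN_SCHEDULER_CREDIT;
    /* issue the hypercall; on success cpupool_id holds the new pool's id */

    ctl.u.cpupool_op.op  = XEN_DOMCTL_CPUPOOL_OP_ADDCPU;
    ctl.u.cpupool_op.cpu = XEN_DOMCTL_CPUPOOL_PAR_ANY;         /* first free cpu */
    /* issue the hypercall again to assign a free cpu to the new pool */
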
diff -r fadf63ab49e7 xen/common/sched_credit.c
--- a/xen/common/sched_credit.c Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/common/sched_credit.c Tue Apr 20 11:10:40 2010 +0200
@@ -70,11 +70,15 @@
 /*
  * Useful macros
  */
+#define CSCHED_PRIV(_ops)   \
+    ((struct csched_private *)((_ops)->sched_data))
 #define CSCHED_PCPU(_c)     \
     ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
 #define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
 #define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
 #define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
+#define CSCHED_CPUONLINE(_pool)    \
+    (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid)
 
 
 /*
@@ -160,19 +164,22 @@ struct csched_private {
     struct timer  master_ticker;
     unsigned int master;
     cpumask_t idlers;
+    cpumask_t cpus;
     uint32_t weight;
     uint32_t credit;
     int credit_balance;
     uint32_t runq_sort;
+    int ticker_active;
 };
 
 
 /*
  * Global variables
  */
-static struct csched_private csched_priv;
+static struct csched_private *csched_priv0 = NULL;
 
 static void csched_tick(void *_cpu);
+static void csched_acct(void *dummy);
 
 static inline int
 __vcpu_on_runq(struct csched_vcpu *svc)
@@ -238,6 +245,7 @@ __runq_tickle(unsigned int cpu, struct c
 {
     struct csched_vcpu * const cur =
         CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
     cpumask_t mask;
 
     ASSERT(cur);
@@ -264,7 +272,7 @@ __runq_tickle(unsigned int cpu, struct c
      */
     if ( cur->pri > CSCHED_PRI_IDLE )
     {
-        if ( cpus_empty(csched_priv.idlers) )
+        if ( cpus_empty(prv->idlers) )
         {
             CSCHED_STAT_CRANK(tickle_idlers_none);
         }
@@ -272,7 +280,7 @@ __runq_tickle(unsigned int cpu, struct c
         {
             cpumask_t idle_mask;
 
-            cpus_and(idle_mask, csched_priv.idlers, new->vcpu->cpu_affinity);
+            cpus_and(idle_mask, prv->idlers, new->vcpu->cpu_affinity);
             if ( !cpus_empty(idle_mask) )
             {
                 CSCHED_STAT_CRANK(tickle_idlers_some);
@@ -294,40 +302,80 @@ __runq_tickle(unsigned int cpu, struct c
         cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
 }
 
-static int
-csched_pcpu_init(int cpu)
+static void
+csched_free_pdata(struct scheduler *ops, void *pcpu, int cpu)
+{
+    struct csched_private *prv = CSCHED_PRIV(ops);
+    struct csched_pcpu *spc = pcpu;
+    unsigned long flags;
+
+    if ( spc == NULL )
+        return;
+
+    spin_lock_irqsave(&prv->lock, flags);
+
+    prv->credit -= CSCHED_CREDITS_PER_ACCT;
+    prv->ncpus--;
+    cpu_clear(cpu, prv->idlers);
+    cpu_clear(cpu, prv->cpus);
+    if ( (prv->master == cpu) && (prv->ncpus > 0) )
+    {
+        prv->master = first_cpu(prv->cpus);
+        migrate_timer(&prv->master_ticker, prv->master);
+    }
+    kill_timer(&spc->ticker);
+    if ( prv->ncpus == 0 )
+        kill_timer(&prv->master_ticker);
+
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+    xfree(spc);
+}
+
+static void *
+csched_alloc_pdata(struct scheduler *ops, int cpu)
 {
     struct csched_pcpu *spc;
+    struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
     /* Allocate per-PCPU info */
     spc = xmalloc(struct csched_pcpu);
     if ( spc == NULL )
-        return -1;
+        return NULL;
     memset(spc, 0, sizeof(*spc));
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
+    spin_lock_irqsave(&prv->lock, flags);
 
     /* Initialize/update system-wide config */
-    csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
-    if ( csched_priv.ncpus <= cpu )
-        csched_priv.ncpus = cpu + 1;
-    if ( csched_priv.master >= csched_priv.ncpus )
-        csched_priv.master = cpu;
+    prv->credit += CSCHED_CREDITS_PER_ACCT;
+    prv->ncpus++;
+    cpu_set(cpu, prv->cpus);
+    if ( (prv->ncpus == 1) && (prv != csched_priv0) )
+    {
+        prv->master = cpu;
+        init_timer( &prv->master_ticker, csched_acct, prv, cpu);
+        prv->ticker_active = 2;
+    }
 
     init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
+
+    if ( prv == csched_priv0 )
+        prv->master = first_cpu(prv->cpus);
+
     INIT_LIST_HEAD(&spc->runq);
-    spc->runq_sort_last = csched_priv.runq_sort;
+    spc->runq_sort_last = prv->runq_sort;
     spc->idle_bias = NR_CPUS - 1;
-    per_cpu(schedule_data, cpu).sched_priv = spc;
+    if ( per_cpu(schedule_data, cpu).sched_priv == NULL )
+        per_cpu(schedule_data, cpu).sched_priv = spc;
 
     /* Start off idling... */
     BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
-    cpu_set(cpu, csched_priv.idlers);
+    cpu_set(cpu, prv->idlers);
 
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
+    spin_unlock_irqrestore(&prv->lock, flags);
 
-    return 0;
+    return spc;
 }
 
 #ifndef NDEBUG
@@ -400,17 +448,19 @@ __csched_vcpu_is_migrateable(struct vcpu
 }
 
 static int
-_csched_cpu_pick(struct vcpu *vc, bool_t commit)
+_csched_cpu_pick(struct scheduler *ops, struct vcpu *vc, bool_t commit)
 {
     cpumask_t cpus;
     cpumask_t idlers;
+    cpumask_t *online;
     int cpu;
 
     /*
      * Pick from online CPUs in VCPU's affinity mask, giving a
      * preference to its current processor if it's in there.
      */
-    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+    online = CSCHED_CPUONLINE(vc->domain->cpupool);
+    cpus_and(cpus, *online, vc->cpu_affinity);
     cpu = cpu_isset(vc->processor, cpus)
             ? vc->processor
             : cycle_cpu(vc->processor, cpus);
@@ -428,7 +478,7 @@ _csched_cpu_pick(struct vcpu *vc, bool_t
      * like run two VCPUs on co-hyperthreads while there are idle cores
      * or sockets.
      */
-    cpus_and(idlers, cpu_online_map, csched_priv.idlers);
+    cpus_and(idlers, cpu_online_map, CSCHED_PRIV(ops)->idlers);
     cpu_set(cpu, idlers);
     cpus_and(cpus, cpus, idlers);
     cpu_clear(cpu, cpus);
@@ -474,18 +524,18 @@ _csched_cpu_pick(struct vcpu *vc, bool_t
 }
 
 static int
-csched_cpu_pick(struct vcpu *vc)
+csched_cpu_pick(struct scheduler *ops, struct vcpu *vc)
 {
-    return _csched_cpu_pick(vc, 1);
+    return _csched_cpu_pick(ops, vc, 1);
 }
 
 static inline void
-__csched_vcpu_acct_start(struct csched_vcpu *svc)
+__csched_vcpu_acct_start(struct csched_private *prv, struct csched_vcpu *svc)
 {
     struct csched_dom * const sdom = svc->sdom;
     unsigned long flags;
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
+    spin_lock_irqsave(&prv->lock, flags);
 
     if ( list_empty(&svc->active_vcpu_elem) )
     {
@@ -496,16 +546,17 @@ __csched_vcpu_acct_start(struct csched_v
         list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
         if ( list_empty(&sdom->active_sdom_elem) )
         {
-            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
-            csched_priv.weight += sdom->weight;
+            list_add(&sdom->active_sdom_elem, &prv->active_sdom);
+            prv->weight += sdom->weight;
         }
     }
 
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
+    spin_unlock_irqrestore(&prv->lock, flags);
 }
 
 static inline void
-__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
+__csched_vcpu_acct_stop_locked(struct csched_private *prv,
+    struct csched_vcpu *svc)
 {
     struct csched_dom * const sdom = svc->sdom;
 
@@ -518,16 +569,17 @@ __csched_vcpu_acct_stop_locked(struct cs
     list_del_init(&svc->active_vcpu_elem);
     if ( list_empty(&sdom->active_vcpu) )
     {
-        BUG_ON( csched_priv.weight < sdom->weight );
+        BUG_ON( prv->weight < sdom->weight );
         list_del_init(&sdom->active_sdom_elem);
-        csched_priv.weight -= sdom->weight;
+        prv->weight -= sdom->weight;
     }
 }
 
 static void
-csched_vcpu_acct(unsigned int cpu)
+csched_vcpu_acct(struct csched_private *prv, unsigned int cpu)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(current);
+    struct scheduler *ops = per_cpu(scheduler, cpu);
 
     ASSERT( current->processor == cpu );
     ASSERT( svc->sdom != NULL );
@@ -556,9 +608,9 @@ csched_vcpu_acct(unsigned int cpu)
      */
     if ( list_empty(&svc->active_vcpu_elem) )
     {
-        __csched_vcpu_acct_start(svc);
+        __csched_vcpu_acct_start(prv, svc);
     }
-    else if ( _csched_cpu_pick(current, 0) != cpu )
+    else if ( _csched_cpu_pick(ops, current, 0) != cpu )
     {
         CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
         CSCHED_STAT_CRANK(migrate_running);
@@ -567,66 +619,75 @@ csched_vcpu_acct(unsigned int cpu)
     }
 }
 
-static int
-csched_vcpu_init(struct vcpu *vc)
+static void *
+csched_alloc_vdata(struct scheduler *ops, struct vcpu *vc, void *dd)
 {
-    struct domain * const dom = vc->domain;
-    struct csched_dom *sdom = CSCHED_DOM(dom);
     struct csched_vcpu *svc;
-
-    CSCHED_STAT_CRANK(vcpu_init);
 
     /* Allocate per-VCPU info */
     svc = xmalloc(struct csched_vcpu);
     if ( svc == NULL )
-        return -1;
+        return NULL;
     memset(svc, 0, sizeof(*svc));
 
     INIT_LIST_HEAD(&svc->runq_elem);
     INIT_LIST_HEAD(&svc->active_vcpu_elem);
-    svc->sdom = sdom;
+    svc->sdom = dd;
     svc->vcpu = vc;
     atomic_set(&svc->credit, 0);
     svc->flags = 0U;
-    svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
+    svc->pri = is_idle_domain(vc->domain) ?
+        CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
     CSCHED_VCPU_STATS_RESET(svc);
-    vc->sched_priv = svc;
-
-    /* Allocate per-PCPU info */
-    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
-    {
-        if ( csched_pcpu_init(vc->processor) != 0 )
-            return -1;
-    }
-
-    CSCHED_VCPU_CHECK(vc);
-    return 0;
+    CSCHED_STAT_CRANK(vcpu_init);
+    return svc;
 }
 
 static void
-csched_vcpu_destroy(struct vcpu *vc)
+csched_vcpu_insert(struct scheduler *ops, struct vcpu *vc)
+{
+    struct csched_vcpu *svc = vc->sched_priv;
+
+    if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running )
+        __runq_insert(vc->processor, svc);
+}
+
+static void
+csched_free_vdata(struct scheduler *ops, void *priv)
+{
+    struct csched_private *prv = CSCHED_PRIV(ops);
+    struct csched_vcpu *svc = priv;
+    unsigned long flags;
+
+    if ( __vcpu_on_runq(svc) )
+        __runq_remove(svc);
+
+    spin_lock_irqsave(&(prv->lock), flags);
+
+    if ( !list_empty(&svc->active_vcpu_elem) )
+        __csched_vcpu_acct_stop_locked(prv, svc);
+
+    spin_unlock_irqrestore(&(prv->lock), flags);
+
+    xfree(svc);
+}
+
+static void
+csched_vcpu_destroy(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
     struct csched_dom * const sdom = svc->sdom;
-    unsigned long flags;
 
     CSCHED_STAT_CRANK(vcpu_destroy);
 
     BUG_ON( sdom == NULL );
     BUG_ON( !list_empty(&svc->runq_elem) );
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
-
-    if ( !list_empty(&svc->active_vcpu_elem) )
-        __csched_vcpu_acct_stop_locked(svc);
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
-
-    xfree(svc);
+    csched_free_vdata(ops, svc);
 }
 
 static void
-csched_vcpu_sleep(struct vcpu *vc)
+csched_vcpu_sleep(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
 
@@ -641,7 +702,7 @@ csched_vcpu_sleep(struct vcpu *vc)
 }
 
 static void
-csched_vcpu_wake(struct vcpu *vc)
+csched_vcpu_wake(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
     const unsigned int cpu = vc->processor;
@@ -697,10 +758,12 @@ csched_vcpu_wake(struct vcpu *vc)
 
 static int
 csched_dom_cntl(
+    struct scheduler *ops,
     struct domain *d,
     struct xen_domctl_scheduler_op *op)
 {
     struct csched_dom * const sdom = CSCHED_DOM(d);
+    struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
@@ -712,14 +775,14 @@ csched_dom_cntl(
     {
         ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
 
-        spin_lock_irqsave(&csched_priv.lock, flags);
+        spin_lock_irqsave(&prv->lock, flags);
 
         if ( op->u.credit.weight != 0 )
         {
             if ( !list_empty(&sdom->active_sdom_elem) )
             {
-                csched_priv.weight -= sdom->weight;
-                csched_priv.weight += op->u.credit.weight;
+                prv->weight -= sdom->weight;
+                prv->weight += op->u.credit.weight;
             }
             sdom->weight = op->u.credit.weight;
         }
@@ -727,25 +790,20 @@ csched_dom_cntl(
         if ( op->u.credit.cap != (uint16_t)~0U )
             sdom->cap = op->u.credit.cap;
 
-        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        spin_unlock_irqrestore(&prv->lock, flags);
     }
 
     return 0;
 }
 
-static int
-csched_dom_init(struct domain *dom)
+static void *
+csched_alloc_domdata(struct scheduler *ops, struct domain *dom)
 {
     struct csched_dom *sdom;
 
-    CSCHED_STAT_CRANK(dom_init);
-
-    if ( is_idle_domain(dom) )
-        return 0;
-
     sdom = xmalloc(struct csched_dom);
     if ( sdom == NULL )
-        return -ENOMEM;
+        return NULL;
     memset(sdom, 0, sizeof(*sdom));
 
     /* Initialize credit and weight */
@@ -755,16 +813,40 @@ csched_dom_init(struct domain *dom)
     sdom->dom = dom;
     sdom->weight = CSCHED_DEFAULT_WEIGHT;
     sdom->cap = 0U;
+
+    return (void *)sdom;
+}
+
+static int
+csched_dom_init(struct scheduler *ops, struct domain *dom)
+{
+    struct csched_dom *sdom;
+
+    CSCHED_STAT_CRANK(dom_init);
+
+    if ( is_idle_domain(dom) )
+        return 0;
+
+    sdom = csched_alloc_domdata(ops, dom);
+    if ( sdom == NULL )
+        return -ENOMEM;
+
     dom->sched_priv = sdom;
 
     return 0;
 }
 
 static void
-csched_dom_destroy(struct domain *dom)
+csched_free_domdata(struct scheduler *ops, void *data)
+{
+    xfree(data);
+}
+
+static void
+csched_dom_destroy(struct scheduler *ops, struct domain *dom)
 {
     CSCHED_STAT_CRANK(dom_destroy);
-    xfree(CSCHED_DOM(dom));
+    csched_free_domdata(ops, CSCHED_DOM(dom));
 }
 
 /*
@@ -775,7 +857,7 @@ csched_dom_destroy(struct domain *dom)
  * remember the last UNDER to make the move up operation O(1).
  */
 static void
-csched_runq_sort(unsigned int cpu)
+csched_runq_sort(struct csched_private *prv, unsigned int cpu)
 {
     struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
     struct list_head *runq, *elem, *next, *last_under;
@@ -783,7 +865,7 @@ csched_runq_sort(unsigned int cpu)
     unsigned long flags;
     int sort_epoch;
 
-    sort_epoch = csched_priv.runq_sort;
+    sort_epoch = prv->runq_sort;
     if ( sort_epoch == spc->runq_sort_last )
         return;
 
@@ -820,6 +902,7 @@ static void
 static void
 csched_acct(void* dummy)
 {
+    struct csched_private *prv = dummy;
     unsigned long flags;
     struct list_head *iter_vcpu, *next_vcpu;
     struct list_head *iter_sdom, *next_sdom;
@@ -836,22 +919,22 @@ csched_acct(void* dummy)
     int credit;
 
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
+    spin_lock_irqsave(&prv->lock, flags);
 
-    weight_total = csched_priv.weight;
-    credit_total = csched_priv.credit;
+    weight_total = prv->weight;
+    credit_total = prv->credit;
 
     /* Converge balance towards 0 when it drops negative */
-    if ( csched_priv.credit_balance < 0 )
+    if ( prv->credit_balance < 0 )
     {
-        credit_total -= csched_priv.credit_balance;
+        credit_total -= prv->credit_balance;
         CSCHED_STAT_CRANK(acct_balance);
     }
 
     if ( unlikely(weight_total == 0) )
     {
-        csched_priv.credit_balance = 0;
-        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        prv->credit_balance = 0;
+        spin_unlock_irqrestore(&prv->lock, flags);
         CSCHED_STAT_CRANK(acct_no_work);
         goto out;
     }
@@ -863,7 +946,7 @@ csched_acct(void* dummy)
     credit_xtra = 0;
     credit_cap = 0U;
 
-    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
+    list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom )
     {
         sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
 
@@ -883,9 +966,9 @@ csched_acct(void* dummy)
          * only when the system-wide credit balance is negative.
          */
         credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
-        if ( csched_priv.credit_balance < 0 )
+        if ( prv->credit_balance < 0 )
         {
-            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
+            credit_peak += ( ( -prv->credit_balance * sdom->weight) +
                              (weight_total - 1)
                            ) / weight_total;
         }
@@ -927,7 +1010,7 @@ csched_acct(void* dummy)
                  */
                 CSCHED_STAT_CRANK(acct_reorder);
                 list_del(&sdom->active_sdom_elem);
-                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+                list_add(&sdom->active_sdom_elem, &prv->active_sdom);
             }
 
             credit_fair = credit_peak;
@@ -993,7 +1076,7 @@ csched_acct(void* dummy)
                 /* Upper bound on credits means VCPU stops earning */
                 if ( credit > CSCHED_CREDITS_PER_TSLICE )
                 {
-                    __csched_vcpu_acct_stop_locked(svc);
+                    __csched_vcpu_acct_stop_locked(prv, svc);
                     credit = 0;
                     atomic_set(&svc->credit, credit);
                 }
@@ -1005,15 +1088,15 @@ csched_acct(void* dummy)
         }
     }
 
-    csched_priv.credit_balance = credit_balance;
+    prv->credit_balance = credit_balance;
 
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
+    spin_unlock_irqrestore(&prv->lock, flags);
 
     /* Inform each CPU that its runq needs to be sorted */
-    csched_priv.runq_sort++;
+    prv->runq_sort++;
 
 out:
-    set_timer( &csched_priv.master_ticker, NOW() +
+    set_timer( &prv->master_ticker, NOW() +
             MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
 }
 
@@ -1022,6 +1105,7 @@ csched_tick(void *_cpu)
 {
     unsigned int cpu = (unsigned long)_cpu;
     struct csched_pcpu *spc = CSCHED_PCPU(cpu);
+    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
 
     spc->tick++;
 
@@ -1029,7 +1113,7 @@ csched_tick(void *_cpu)
      * Accounting for running VCPU
      */
     if ( !is_idle_vcpu(current) )
-        csched_vcpu_acct(cpu);
+        csched_vcpu_acct(prv, cpu);
 
     /*
      * Check if runq needs to be sorted
@@ -1038,7 +1122,7 @@ csched_tick(void *_cpu)
      * modified priorities. This is a special O(n) sort and runs at most
      * once per accounting period (currently 30 milliseconds).
      */
-    csched_runq_sort(cpu);
+    csched_runq_sort(prv, cpu);
 
     set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
 }
@@ -1091,16 +1175,19 @@ csched_runq_steal(int peer_cpu, int cpu,
 }
 
 static struct csched_vcpu *
-csched_load_balance(int cpu, struct csched_vcpu *snext)
+csched_load_balance(struct csched_private *prv, int cpu,
+    struct csched_vcpu *snext)
 {
     struct csched_vcpu *speer;
     cpumask_t workers;
+    cpumask_t *online;
     int peer_cpu;
 
     BUG_ON( cpu != snext->vcpu->processor );
+    online = CSCHED_CPUONLINE(per_cpu(cpupool, cpu));
 
     /* If this CPU is going offline we shouldn't steal work. */
-    if ( unlikely(!cpu_online(cpu)) )
+    if ( unlikely(!cpu_isset(cpu, *online)) )
         goto out;
 
     if ( snext->pri == CSCHED_PRI_IDLE )
@@ -1114,7 +1201,7 @@ csched_load_balance(int cpu, struct csch
      * Peek at non-idling CPUs in the system, starting with our
      * immediate neighbour.
      */
-    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
+    cpus_andnot(workers, *online, prv->idlers);
     cpu_clear(cpu, workers);
     peer_cpu = cpu;
 
@@ -1156,11 +1243,12 @@ csched_load_balance(int cpu, struct csch
  * fast for the common case.
  */
 static struct task_slice
-csched_schedule(s_time_t now)
+csched_schedule(struct scheduler *ops, s_time_t now)
 {
     const int cpu = smp_processor_id();
     struct list_head * const runq = RUNQ(cpu);
     struct csched_vcpu * const scurr = CSCHED_VCPU(current);
+    struct csched_private *prv = CSCHED_PRIV(ops);
     struct csched_vcpu *snext;
     struct task_slice ret;
 
@@ -1207,7 +1295,7 @@ csched_schedule(s_time_t now)
     if ( snext->pri > CSCHED_PRI_TS_OVER )
         __runq_remove(snext);
     else
-        snext = csched_load_balance(cpu, snext);
+        snext = csched_load_balance(prv, cpu, snext);
 
     /*
      * Update idlers mask if necessary. When we're idling, other CPUs
@@ -1215,12 +1303,12 @@ csched_schedule(s_time_t now)
      */
     if ( snext->pri == CSCHED_PRI_IDLE )
     {
-        if ( !cpu_isset(cpu, csched_priv.idlers) )
-            cpu_set(cpu, csched_priv.idlers);
+        if ( !cpu_isset(cpu, prv->idlers) )
+            cpu_set(cpu, prv->idlers);
     }
-    else if ( cpu_isset(cpu, csched_priv.idlers) )
+    else if ( cpu_isset(cpu, prv->idlers) )
     {
-        cpu_clear(cpu, csched_priv.idlers);
+        cpu_clear(cpu, prv->idlers);
     }
 
     if ( !is_idle_vcpu(snext->vcpu) )
@@ -1267,7 +1355,7 @@ csched_dump_vcpu(struct csched_vcpu *svc
 }
 
 static void
-csched_dump_pcpu(int cpu)
+csched_dump_pcpu(struct scheduler *ops, int cpu)
 {
     struct list_head *runq, *iter;
     struct csched_pcpu *spc;
@@ -1305,9 +1393,10 @@ csched_dump_pcpu(int cpu)
 }
 
 static void
-csched_dump(void)
+csched_dump(struct scheduler *ops)
 {
     struct list_head *iter_sdom, *iter_svc;
+    struct csched_private *prv = CSCHED_PRIV(ops);
     int loop;
 #define idlers_buf keyhandler_scratch
 
@@ -1324,12 +1413,12 @@ csched_dump(void)
            "\tticks per tslice   = %d\n"
            "\tticks per acct     = %d\n"
            "\tmigration delay    = %uus\n",
-           csched_priv.ncpus,
-           csched_priv.master,
-           csched_priv.credit,
-           csched_priv.credit_balance,
-           csched_priv.weight,
-           csched_priv.runq_sort,
+           prv->ncpus,
+           prv->master,
+           prv->credit,
+           prv->credit_balance,
+           prv->weight,
+           prv->runq_sort,
            CSCHED_DEFAULT_WEIGHT,
            CSCHED_MSECS_PER_TICK,
            CSCHED_CREDITS_PER_MSEC,
@@ -1337,12 +1426,12 @@ csched_dump(void)
            CSCHED_TICKS_PER_ACCT,
            vcpu_migration_delay);
 
-    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
+    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), prv->idlers);
     printk("idlers: %s\n", idlers_buf);
 
     printk("active vcpus:\n");
     loop = 0;
-    list_for_each( iter_sdom, &csched_priv.active_sdom )
+    list_for_each( iter_sdom, &prv->active_sdom )
     {
         struct csched_dom *sdom;
         sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
@@ -1359,18 +1448,30 @@ csched_dump(void)
 #undef idlers_buf
 }
 
-static void
-csched_init(void)
+static int
+csched_init(struct scheduler *ops, int pool0)
 {
-    spin_lock_init(&csched_priv.lock);
-    INIT_LIST_HEAD(&csched_priv.active_sdom);
-    csched_priv.ncpus = 0;
-    csched_priv.master = UINT_MAX;
-    cpus_clear(csched_priv.idlers);
-    csched_priv.weight = 0U;
-    csched_priv.credit = 0U;
-    csched_priv.credit_balance = 0;
-    csched_priv.runq_sort = 0U;
+    struct csched_private *prv;
+
+    prv = xmalloc(struct csched_private);
+    if ( prv == NULL )
+        return 1;
+    memset(prv, 0, sizeof(*prv));
+    if ( pool0 )
+        csched_priv0 = prv;
+    ops->sched_data = prv;
+    spin_lock_init(&prv->lock);
+    INIT_LIST_HEAD(&prv->active_sdom);
+    prv->ncpus = 0;
+    prv->master = UINT_MAX;
+    cpus_clear(prv->idlers);
+    prv->weight = 0U;
+    prv->credit = 0U;
+    prv->credit_balance = 0;
+    prv->runq_sort = 0U;
+    prv->ticker_active = (csched_priv0 == prv) ? 0 : 1;
+
+    return 0;
 }
 
 /* Tickers cannot be kicked until SMP subsystem is alive. */
@@ -1380,8 +1481,10 @@ static __init int csched_start_tickers(v
     unsigned int cpu;
 
     /* Is the credit scheduler initialised? */
-    if ( csched_priv.ncpus == 0 )
+    if ( (csched_priv0 == NULL) || (csched_priv0->ncpus == 0) )
         return 0;
+
+    csched_priv0->ticker_active = 1;
 
     for_each_online_cpu ( cpu )
     {
@@ -1389,45 +1492,72 @@ static __init int csched_start_tickers(v
         set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
     }
 
-    init_timer( &csched_priv.master_ticker, csched_acct, NULL,
-                    csched_priv.master);
+    init_timer( &csched_priv0->master_ticker, csched_acct, csched_priv0,
+                    csched_priv0->master);
 
-    set_timer( &csched_priv.master_ticker, NOW() +
+    set_timer( &csched_priv0->master_ticker, NOW() +
             MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
 
     return 0;
 }
 __initcall(csched_start_tickers);
 
-static void csched_tick_suspend(void)
+static void
+csched_deinit(struct scheduler *ops)
+{
+    struct csched_private *prv;
+
+    prv = CSCHED_PRIV(ops);
+    if ( prv != NULL )
+        xfree(prv);
+}
+
+static void csched_tick_suspend(struct scheduler *ops, unsigned int cpu)
 {
     struct csched_pcpu *spc;
 
-    spc = CSCHED_PCPU(smp_processor_id());
+    spc = CSCHED_PCPU(cpu);
 
     stop_timer(&spc->ticker);
 }
 
-static void csched_tick_resume(void)
+static void csched_tick_resume(struct scheduler *ops, unsigned int cpu)
 {
     struct csched_pcpu *spc;
     uint64_t now = NOW();
+    struct csched_private *prv;
 
-    spc = CSCHED_PCPU(smp_processor_id());
+    prv = CSCHED_PRIV(ops);
+    if ( !prv->ticker_active )
+        return;
+
+
+    spc = CSCHED_PCPU(cpu);
 
     set_timer(&spc->ticker, now + MILLISECS(CSCHED_MSECS_PER_TICK)
             - now % MILLISECS(CSCHED_MSECS_PER_TICK) );
+
+    if ( (prv->ticker_active == 2) && (prv->master == cpu) )
+    {
+        set_timer( &prv->master_ticker, now +
+            MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT -
+            now % MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT);
+        prv->ticker_active = 1;
+    }
 }
 
-const struct scheduler sched_credit_def = {
+static struct csched_private _csched_priv;
+
+struct scheduler sched_credit_def = {
     .name           = "SMP Credit Scheduler",
     .opt_name       = "credit",
     .sched_id       = XEN_SCHEDULER_CREDIT,
+    .sched_data     = &_csched_priv,
 
     .init_domain    = csched_dom_init,
     .destroy_domain = csched_dom_destroy,
 
-    .init_vcpu      = csched_vcpu_init,
+    .insert_vcpu    = csched_vcpu_insert,
     .destroy_vcpu   = csched_vcpu_destroy,
 
     .sleep          = csched_vcpu_sleep,
@@ -1441,6 +1571,13 @@ const struct scheduler sched_credit_def 
     .dump_cpu_state = csched_dump_pcpu,
     .dump_settings  = csched_dump,
     .init           = csched_init,
+    .deinit         = csched_deinit,
+    .alloc_vdata    = csched_alloc_vdata,
+    .free_vdata     = csched_free_vdata,
+    .alloc_pdata    = csched_alloc_pdata,
+    .free_pdata     = csched_free_pdata,
+    .alloc_domdata  = csched_alloc_domdata,
+    .free_domdata   = csched_free_domdata,
 
     .tick_suspend   = csched_tick_suspend,
     .tick_resume    = csched_tick_resume,
diff -r fadf63ab49e7 xen/common/sched_credit2.c
--- a/xen/common/sched_credit2.c        Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/common/sched_credit2.c        Tue Apr 20 11:10:40 2010 +0200
@@ -149,12 +149,16 @@
 /*
  * Useful macros
  */
+#define CSCHED_PRIV(_ops)   \
+    ((struct csched_private *)((_ops)->sched_data))
 #define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
 #define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
+#define CSCHED_CPUONLINE(_pool)    \
+    (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid)
 /* CPU to runq_id macro */
-#define c2r(_cpu)           (csched_priv.runq_map[(_cpu)])
+#define c2r(_ops, _cpu)     (CSCHED_PRIV(_ops)->runq_map[(_cpu)])
 /* CPU to runqueue struct macro */
-#define RQD(_cpu)          (&csched_priv.rqd[c2r(_cpu)])
+#define RQD(_ops, _cpu)     (&CSCHED_PRIV(_ops)->rqd[c2r(_ops, _cpu)])
 
 /*
  * Per-runqueue data
@@ -212,11 +216,6 @@ struct csched_dom {
     uint16_t nr_vcpus;
 };
 
-
-/*
- * Global variables
- */
-static struct csched_private csched_priv;
 
 /*
  * Time-to-credit, credit-to-time.
@@ -284,15 +283,15 @@ __runq_insert(struct list_head *runq, st
 }
 
 static void
-runq_insert(unsigned int cpu, struct csched_vcpu *svc)
+runq_insert(struct scheduler *ops, unsigned int cpu, struct csched_vcpu *svc)
 {
-    struct list_head * runq = &RQD(cpu)->runq;
+    struct list_head * runq = &RQD(ops, cpu)->runq;
     int pos = 0;
 
     ASSERT( spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock) );
 
     BUG_ON( __vcpu_on_runq(svc) );
-    BUG_ON( c2r(cpu) != c2r(svc->vcpu->processor) );
+    BUG_ON( c2r(ops, cpu) != c2r(ops, svc->vcpu->processor) );
 
     pos = __runq_insert(runq, svc);
 
@@ -324,11 +323,12 @@ void burn_credits(struct csched_runqueue
 /* Check to see if the item on the runqueue is higher priority than what's
  * currently running; if so, wake up the processor */
 static /*inline*/ void
-runq_tickle(unsigned int cpu, struct csched_vcpu *new, s_time_t now)
+runq_tickle(struct scheduler *ops, unsigned int cpu, struct csched_vcpu *new, s_time_t now)
 {
     int i, ipid=-1;
     s_time_t lowest=(1<<30);
-    struct csched_runqueue_data *rqd = RQD(cpu);
+    struct csched_runqueue_data *rqd = RQD(ops, cpu);
+    cpumask_t *online;
 
     d2printk("rqt d%dv%d cd%dv%d\n",
              new->vcpu->domain->domain_id,
@@ -336,13 +336,14 @@ runq_tickle(unsigned int cpu, struct csc
              current->domain->domain_id,
              current->vcpu_id);
 
+    online = CSCHED_CPUONLINE(per_cpu(cpupool, cpu));
     /* Find the cpu in this queue group that has the lowest credits */
     for ( i=rqd->cpu_min ; i < rqd->cpu_max ; i++ )
     {
         struct csched_vcpu * cur;
 
         /* Skip cpus that aren't online */
-        if ( !cpu_online(i) )
+        if ( !cpu_isset(i, *online) )
             continue;
 
         cur = CSCHED_VCPU(per_cpu(schedule_data, i).curr);
@@ -396,11 +397,11 @@ runq_tickle(unsigned int cpu, struct csc
 /*
  * Credit-related code
  */
-static void reset_credit(int cpu, s_time_t now)
+static void reset_credit(struct scheduler *ops, int cpu, s_time_t now)
 {
     struct list_head *iter;
 
-    list_for_each( iter, &RQD(cpu)->svc )
+    list_for_each( iter, &RQD(ops, cpu)->svc )
     {
         struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, rqd_elem);
 
@@ -521,64 +522,100 @@ __csched_vcpu_check(struct vcpu *vc)
 #define CSCHED_VCPU_CHECK(_vc)
 #endif
 
-static int
-csched_vcpu_init(struct vcpu *vc)
+static void *
+csched_alloc_vdata(struct scheduler *ops, struct vcpu *vc, void *dd)
 {
-    struct domain * const dom = vc->domain;
-    struct csched_dom *sdom = CSCHED_DOM(dom);
     struct csched_vcpu *svc;
-
-    printk("%s: Initializing d%dv%d\n",
-           __func__, dom->domain_id, vc->vcpu_id);
 
     /* Allocate per-VCPU info */
     svc = xmalloc(struct csched_vcpu);
     if ( svc == NULL )
-        return -1;
+        return NULL;
+    memset(svc, 0, sizeof(*svc));
 
     INIT_LIST_HEAD(&svc->rqd_elem);
     INIT_LIST_HEAD(&svc->sdom_elem);
     INIT_LIST_HEAD(&svc->runq_elem);
 
-    svc->sdom = sdom;
+    svc->sdom = dd;
     svc->vcpu = vc;
     svc->flags = 0U;
-    vc->sched_priv = svc;
 
     if ( ! is_idle_vcpu(vc) )
     {
-        BUG_ON( sdom == NULL );
+        BUG_ON( svc->sdom == NULL );
 
         svc->credit = CSCHED_CREDIT_INIT;
-        svc->weight = sdom->weight;
+        svc->weight = svc->sdom->weight;
+    }
+    else
+    {
+        BUG_ON( svc->sdom != NULL );
+        svc->credit = CSCHED_IDLE_CREDIT;
+        svc->weight = 0;
+    }
 
+    return svc;
+}
+
+static void
+csched_vcpu_insert(struct scheduler *ops, struct vcpu *vc)
+{
+    struct csched_vcpu *svc = vc->sched_priv;
+    struct domain * const dom = vc->domain;
+    struct csched_dom *sdom = CSCHED_DOM(dom);
+
+    printk("%s: Inserting d%dv%d\n",
+           __func__, dom->domain_id, vc->vcpu_id);
+
+    if ( ! is_idle_vcpu(vc) )
+    {
         /* FIXME: Do we need the private lock here? */
-        list_add_tail(&svc->sdom_elem, &sdom->vcpu);
+        list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
 
         /* Add vcpu to runqueue of initial processor */
         /* FIXME: Abstract for multiple runqueues */
         vcpu_schedule_lock_irq(vc);
 
-        list_add_tail(&svc->rqd_elem, &RQD(vc->processor)->svc);
-        update_max_weight(RQD(vc->processor), svc->weight, 0);
+        list_add_tail(&svc->rqd_elem, &RQD(ops, vc->processor)->svc);
+        update_max_weight(RQD(ops, vc->processor), svc->weight, 0);
 
         vcpu_schedule_unlock_irq(vc);
 
         sdom->nr_vcpus++;
     }
-    else
-    {
-        BUG_ON( sdom != NULL );
-        svc->credit = CSCHED_IDLE_CREDIT;
-        svc->weight = 0;
-    }
 
     CSCHED_VCPU_CHECK(vc);
-    return 0;
 }
 
 static void
-csched_vcpu_destroy(struct vcpu *vc)
+csched_free_vdata(struct scheduler *ops, void *priv)
+{
+    struct csched_vcpu *svc = priv;
+    struct vcpu *vc = svc->vcpu;
+
+    if ( ! is_idle_vcpu(vc) )
+    {
+        /* Remove from runqueue */
+        vcpu_schedule_lock_irq(vc);
+
+        list_del_init(&svc->rqd_elem);
+        update_max_weight(RQD(ops, vc->processor), 0, svc->weight);
+
+        vcpu_schedule_unlock_irq(vc);
+
+        /* Remove from sdom list.  Don't need a lock for this, as it's called
+         * synchronously when nothing else can happen. */
+        list_del_init(&svc->sdom_elem);
+
+        svc->sdom->nr_vcpus--;
+    }
+
+    xfree(svc);
+}
+
+static void
+csched_vcpu_destroy(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
     struct csched_dom * const sdom = svc->sdom;
@@ -586,25 +623,11 @@ csched_vcpu_destroy(struct vcpu *vc)
     BUG_ON( sdom == NULL );
     BUG_ON( !list_empty(&svc->runq_elem) );
 
-    /* Remove from runqueue */
-    vcpu_schedule_lock_irq(vc);
-
-    list_del_init(&svc->rqd_elem);
-    update_max_weight(RQD(vc->processor), 0, svc->weight);
-
-    vcpu_schedule_unlock_irq(vc);
-
-    /* Remove from sdom list.  Don't need a lock for this, as it's called
-     * syncronously when nothing else can happen. */
-    list_del_init(&svc->sdom_elem);
-
-    sdom->nr_vcpus--;
-
-    xfree(svc);
+    csched_free_vdata(ops, svc);
 }
 
 static void
-csched_vcpu_sleep(struct vcpu *vc)
+csched_vcpu_sleep(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
 
@@ -617,7 +640,7 @@ csched_vcpu_sleep(struct vcpu *vc)
 }
 
 static void
-csched_vcpu_wake(struct vcpu *vc)
+csched_vcpu_wake(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
     const unsigned int cpu = vc->processor;
@@ -654,8 +677,8 @@ csched_vcpu_wake(struct vcpu *vc)
     now = NOW();
 
     /* Put the VCPU on the runq */
-    runq_insert(cpu, svc);
-    runq_tickle(cpu, svc, now);
+    runq_insert(ops, cpu, svc);
+    runq_tickle(ops, cpu, svc, now);
 
 out:
     d2printk("w-\n");
@@ -663,7 +686,7 @@ out:
 }
 
 static void
-csched_context_saved(struct vcpu *vc)
+csched_context_saved(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
 
@@ -688,15 +711,15 @@ csched_context_saved(struct vcpu *vc)
 
         BUG_ON(__vcpu_on_runq(svc));
 
-        runq_insert(cpu, svc);
-        runq_tickle(cpu, svc, NOW());
+        runq_insert(ops, cpu, svc);
+        runq_tickle(ops, cpu, svc, NOW());
     }
 
     vcpu_schedule_unlock_irq(vc);
 }
 
 static int
-csched_cpu_pick(struct vcpu *vc)
+csched_cpu_pick(struct scheduler *ops, struct vcpu *vc)
 {
     /* FIXME: Chose a schedule group based on load */
     /* FIXME: Migrate the vcpu to the new runqueue list, updating
@@ -706,10 +729,12 @@ csched_cpu_pick(struct vcpu *vc)
 
 static int
 csched_dom_cntl(
+    struct scheduler *ops,
     struct domain *d,
     struct xen_domctl_scheduler_op *op)
 {
     struct csched_dom * const sdom = CSCHED_DOM(d);
+    struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
@@ -727,7 +752,7 @@ csched_dom_cntl(
 
             /* Must hold csched_priv lock to update sdom, runq lock to
              * update csvcs. */
-            spin_lock_irqsave(&csched_priv.lock, flags);
+            spin_lock_irqsave(&prv->lock, flags);
 
             old_weight = sdom->weight;
 
@@ -744,32 +769,28 @@ csched_dom_cntl(
                 vcpu_schedule_lock_irq(svc->vcpu);
 
                 svc->weight = sdom->weight;
-                update_max_weight(RQD(svc->vcpu->processor), svc->weight, old_weight);
+                update_max_weight(RQD(ops, svc->vcpu->processor), svc->weight, old_weight);
 
                 vcpu_schedule_unlock_irq(svc->vcpu);
             }
 
-            spin_unlock_irqrestore(&csched_priv.lock, flags);
+            spin_unlock_irqrestore(&prv->lock, flags);
         }
     }
 
     return 0;
 }
 
-static int
-csched_dom_init(struct domain *dom)
+static void *
+csched_alloc_domdata(struct scheduler *ops, struct domain *dom)
 {
     struct csched_dom *sdom;
     int flags;
 
-    printk("%s: Initializing domain %d\n", __func__, dom->domain_id);
-
-    if ( is_idle_domain(dom) )
-        return 0;
-
     sdom = xmalloc(struct csched_dom);
     if ( sdom == NULL )
-        return -ENOMEM;
+        return NULL;
+    memset(sdom, 0, sizeof(*sdom));
 
     /* Initialize credit and weight */
     INIT_LIST_HEAD(&sdom->vcpu);
@@ -778,40 +799,65 @@ csched_dom_init(struct domain *dom)
     sdom->weight = CSCHED_DEFAULT_WEIGHT;
     sdom->nr_vcpus = 0;
 
+    spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags);
+
+    list_add_tail(&sdom->sdom_elem, &CSCHED_PRIV(ops)->sdom);
+
+    spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags);
+
+    return (void *)sdom;
+}
+
+static int
+csched_dom_init(struct scheduler *ops, struct domain *dom)
+{
+    struct csched_dom *sdom;
+
+    printk("%s: Initializing domain %d\n", __func__, dom->domain_id);
+
+    if ( is_idle_domain(dom) )
+        return 0;
+
+    sdom = csched_alloc_domdata(ops, dom);
+    if ( sdom == NULL )
+        return -ENOMEM;
+
     dom->sched_priv = sdom;
-
-    spin_lock_irqsave(&csched_priv.lock, flags);
-
-    list_add_tail(&sdom->sdom_elem, &csched_priv.sdom);
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
 
     return 0;
 }
 
 static void
-csched_dom_destroy(struct domain *dom)
+csched_free_domdata(struct scheduler *ops, void *data)
+{
+    int flags;
+    struct csched_dom *sdom = data;
+
+    spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags);
+
+    list_del_init(&sdom->sdom_elem);
+
+    spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags);
+
+    xfree(data);
+}
+
+static void
+csched_dom_destroy(struct scheduler *ops, struct domain *dom)
 {
     struct csched_dom *sdom = CSCHED_DOM(dom);
-    int flags;
 
     BUG_ON(!list_empty(&sdom->vcpu));
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
-
-    list_del_init(&sdom->sdom_elem);
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
-
-    xfree(CSCHED_DOM(dom));
+    csched_free_domdata(ops, CSCHED_DOM(dom));
 }
 
 /* How long should we let this vcpu run for? */
 static s_time_t
-csched_runtime(int cpu, struct csched_vcpu *snext)
+csched_runtime(struct scheduler *ops, int cpu, struct csched_vcpu *snext)
 {
     s_time_t time = CSCHED_MAX_TIMER;
-    struct csched_runqueue_data *rqd = RQD(cpu);
+    struct csched_runqueue_data *rqd = RQD(ops, cpu);
     struct list_head *runq = &rqd->runq;
 
     if ( is_idle_vcpu(snext->vcpu) )
@@ -851,10 +897,10 @@ void __dump_execstate(void *unused);
  * fast for the common case.
  */
 static struct task_slice
-csched_schedule(s_time_t now)
+csched_schedule(struct scheduler *ops, s_time_t now)
 {
     const int cpu = smp_processor_id();
-    struct csched_runqueue_data *rqd = RQD(cpu);
+    struct csched_runqueue_data *rqd = RQD(ops, cpu);
     struct list_head * const runq = &rqd->runq;
     struct csched_vcpu * const scurr = CSCHED_VCPU(current);
     struct csched_vcpu *snext = NULL;
@@ -927,7 +973,7 @@ csched_schedule(s_time_t now)
     }
 
     if ( !is_idle_vcpu(snext->vcpu) && snext->credit <= CSCHED_CREDIT_RESET )
-        reset_credit(cpu, now);
+        reset_credit(ops, cpu, now);
 
 #if 0
     /*
@@ -955,7 +1001,7 @@ csched_schedule(s_time_t now)
     /*
      * Return task to run next...
      */
-    ret.time = csched_runtime(cpu, snext);
+    ret.time = csched_runtime(ops, cpu, snext);
     ret.task = snext->vcpu;
 
     CSCHED_VCPU_CHECK(ret.task);
@@ -977,7 +1023,7 @@ csched_dump_vcpu(struct csched_vcpu *svc
 }
 
 static void
-csched_dump_pcpu(int cpu)
+csched_dump_pcpu(struct scheduler *ops, int cpu)
 {
     struct list_head *runq, *iter;
     struct csched_vcpu *svc;
@@ -986,7 +1032,7 @@ csched_dump_pcpu(int cpu)
 
     /* FIXME: Do locking properly for access to runqueue structures */
 
-    runq = &RQD(cpu)->runq;
+    runq = &RQD(ops, cpu)->runq;
 
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_map,cpu));
     printk(" sibling=%s, ", cpustr);
@@ -1014,22 +1060,23 @@ csched_dump_pcpu(int cpu)
 }
 
 static void
-csched_dump(void)
+csched_dump(struct scheduler *ops)
 {
     struct list_head *iter_sdom, *iter_svc;
+    struct csched_private *prv = CSCHED_PRIV(ops);
     int loop;
 
     printk("info:\n"
            "\tncpus              = %u\n"
            "\tdefault-weight     = %d\n",
-           csched_priv.ncpus,
+           prv->ncpus,
            CSCHED_DEFAULT_WEIGHT);
 
     /* FIXME: Locking! */
 
     printk("active vcpus:\n");
     loop = 0;
-    list_for_each( iter_sdom, &csched_priv.sdom )
+    list_for_each( iter_sdom, &prv->sdom )
     {
         struct csched_dom *sdom;
         sdom = list_entry(iter_sdom, struct csched_dom, sdom_elem);
@@ -1046,42 +1093,49 @@ csched_dump(void)
 }
 
 static void
-make_runq_map(void)
+make_runq_map(struct scheduler *ops)
 {
     int cpu, cpu_count=0;
+    struct csched_private *prv = CSCHED_PRIV(ops);
 
     /* FIXME: Read pcpu layout and do this properly */
     for_each_possible_cpu( cpu )
     {
-        csched_priv.runq_map[cpu] = 0;
+        prv->runq_map[cpu] = 0;
         cpu_count++;
     }
-    csched_priv.runq_count = 1;
+    prv->runq_count = 1;
 
     /* Move to the init code...? */
-    csched_priv.rqd[0].cpu_min = 0;
-    csched_priv.rqd[0].cpu_max = cpu_count;
+    prv->rqd[0].cpu_min = 0;
+    prv->rqd[0].cpu_max = cpu_count;
 }
 
-static void
-csched_init(void)
+static int
+csched_init(struct scheduler *ops, int pool0)
 {
     int i;
+    struct csched_private *prv;
 
     printk("Initializing Credit2 scheduler\n" \
            " WARNING: This is experimental software in development.\n" \
            " Use at your own risk.\n");
 
-    spin_lock_init(&csched_priv.lock);
-    INIT_LIST_HEAD(&csched_priv.sdom);
+    prv = xmalloc(struct csched_private);
+    if ( prv == NULL )
+        return 1;
+    memset(prv, 0, sizeof(*prv));
 
-    csched_priv.ncpus = 0;
+    spin_lock_init(&prv->lock);
+    INIT_LIST_HEAD(&prv->sdom);
 
-    make_runq_map();
+    prv->ncpus = 0;
 
-    for ( i=0; i<csched_priv.runq_count ; i++ )
+    make_runq_map(ops);
+
+    for ( i=0; i<prv->runq_count ; i++ )
     {
-        struct csched_runqueue_data *rqd = csched_priv.rqd + i;
+        struct csched_runqueue_data *rqd = prv->rqd + i;
 
         rqd->max_weight = 1;
         rqd->id = i;
@@ -1096,24 +1150,40 @@ csched_init(void)
         spinlock_t *lock;
 
         /* Point the per-cpu schedule lock to the runq_id lock */
-        runq_id = csched_priv.runq_map[i];
+        runq_id = prv->runq_map[i];
         lock = &per_cpu(schedule_data, runq_id)._lock;
 
         per_cpu(schedule_data, i).schedule_lock = lock;
 
-        csched_priv.ncpus++;
+        prv->ncpus++;
     }
+
+    return 0;
 }
+
+static void
+csched_deinit(struct scheduler *ops)
+{
+    struct csched_private *prv;
+
+    prv = CSCHED_PRIV(ops);
+    if ( prv != NULL )
+        xfree(prv);
+}
+
+
+static struct csched_private _csched_priv;
 
 struct scheduler sched_credit2_def = {
     .name           = "SMP Credit Scheduler rev2",
     .opt_name       = "credit2",
     .sched_id       = XEN_SCHEDULER_CREDIT2,
+    .sched_data     = &_csched_priv,
 
     .init_domain    = csched_dom_init,
     .destroy_domain = csched_dom_destroy,
 
-    .init_vcpu      = csched_vcpu_init,
+    .insert_vcpu    = csched_vcpu_insert,
     .destroy_vcpu   = csched_vcpu_destroy,
 
     .sleep          = csched_vcpu_sleep,
@@ -1128,4 +1198,9 @@ struct scheduler sched_credit2_def = {
     .dump_cpu_state = csched_dump_pcpu,
     .dump_settings  = csched_dump,
     .init           = csched_init,
+    .deinit         = csched_deinit,
+    .alloc_vdata    = csched_alloc_vdata,
+    .free_vdata     = csched_free_vdata,
+    .alloc_domdata  = csched_alloc_domdata,
+    .free_domdata   = csched_free_domdata,
 };
diff -r fadf63ab49e7 xen/common/sched_sedf.c
--- a/xen/common/sched_sedf.c   Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/common/sched_sedf.c   Tue Apr 20 11:10:40 2010 +0200
@@ -20,6 +20,9 @@
         if ( (_f) <= SEDFLEVEL )                \
             printk(_a );                        \
     } while ( 0 )
+
+#define SEDF_CPUONLINE(_pool)                                             \
+    (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid)
 
 #ifndef NDEBUG
 #define SEDF_STATS
@@ -132,7 +135,7 @@ struct sedf_cpu_info {
 #define sedf_runnable(edom)  (!(EDOM_INFO(edom)->status & SEDF_ASLEEP))
 
 
-static void sedf_dump_cpu_state(int i);
+static void sedf_dump_cpu_state(struct scheduler *ops, int i);
 
 static inline int extraq_on(struct vcpu *d, int i)
 {
@@ -329,30 +332,17 @@ static inline void __add_to_runqueue_sor
 }
 
 
-static int sedf_init_vcpu(struct vcpu *v)
+static void *sedf_alloc_vdata(struct scheduler *ops, struct vcpu *v, void *dd)
 {
     struct sedf_vcpu_info *inf;
 
-    if ( (v->sched_priv = xmalloc(struct sedf_vcpu_info)) == NULL )
-        return -1;
-    memset(v->sched_priv, 0, sizeof(struct sedf_vcpu_info));
+    inf = xmalloc(struct sedf_vcpu_info);
+    if ( inf == NULL )
+        return NULL;
 
-    inf = EDOM_INFO(v);
+    memset(inf, 0, sizeof(struct sedf_vcpu_info));
     inf->vcpu = v;
- 
-    /* Allocate per-CPU context if this is the first domain to be added. */
-    if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) )
-    {
-        per_cpu(schedule_data, v->processor).sched_priv = 
-            xmalloc(struct sedf_cpu_info);
-        BUG_ON(per_cpu(schedule_data, v->processor).sched_priv == NULL);
-        memset(CPU_INFO(v->processor), 0, sizeof(*CPU_INFO(v->processor)));
-        INIT_LIST_HEAD(WAITQ(v->processor));
-        INIT_LIST_HEAD(RUNQ(v->processor));
-        INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_PEN_Q));
-        INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_UTIL_Q));
-    }
-       
+
     /* Every VCPU gets an equal share of extratime by default. */
     inf->deadl_abs   = 0;
     inf->latency     = 0;
@@ -383,39 +373,88 @@ static int sedf_init_vcpu(struct vcpu *v
     }
     else
     {
-        EDOM_INFO(v)->deadl_abs = 0;
-        EDOM_INFO(v)->status &= ~SEDF_ASLEEP;
+        inf->deadl_abs = 0;
+        inf->status &= ~SEDF_ASLEEP;
     }
+
+    return inf;
+}
+
+static void *
+sedf_alloc_pdata(struct scheduler *ops, int cpu)
+{
+    struct sedf_cpu_info *spc;
+
+    spc = xmalloc(struct sedf_cpu_info);
+    BUG_ON(spc == NULL);
+    memset(spc, 0, sizeof(*spc));
+    INIT_LIST_HEAD(&spc->waitq);
+    INIT_LIST_HEAD(&spc->runnableq);
+    INIT_LIST_HEAD(&spc->extraq[EXTRA_PEN_Q]);
+    INIT_LIST_HEAD(&spc->extraq[EXTRA_UTIL_Q]);
+
+    return (void *)spc;
+}
+
+static void
+sedf_free_pdata(struct scheduler *ops, void *spc, int cpu)
+{
+    if ( spc == NULL )
+        return;
+
+    xfree(spc);
+}
+
+static void sedf_free_vdata(struct scheduler *ops, void *priv)
+{
+    xfree(priv);
+}
+
+static void sedf_destroy_vcpu(struct scheduler *ops, struct vcpu *v)
+{
+    sedf_free_vdata(ops, v->sched_priv);
+}
+
+static void *
+sedf_alloc_domdata(struct scheduler *ops, struct domain *d)
+{
+    void *mem;
+
+    mem = xmalloc(struct sedf_dom_info);
+    if ( mem == NULL )
+        return NULL;
+
+    memset(mem, 0, sizeof(struct sedf_dom_info));
+
+    return mem;
+}
+
+static int sedf_init_domain(struct scheduler *ops, struct domain *d)
+{
+    d->sched_priv = sedf_alloc_domdata(ops, d);
+    if ( d->sched_priv == NULL )
+        return -ENOMEM;
 
     return 0;
 }
 
-static void sedf_destroy_vcpu(struct vcpu *v)
+static void sedf_free_domdata(struct scheduler *ops, void *data)
 {
-    xfree(v->sched_priv);
+    xfree(data);
 }
 
-static int sedf_init_domain(struct domain *d)
+static void sedf_destroy_domain(struct scheduler *ops, struct domain *d)
 {
-    d->sched_priv = xmalloc(struct sedf_dom_info);
-    if ( d->sched_priv == NULL )
-        return -ENOMEM;
-
-    memset(d->sched_priv, 0, sizeof(struct sedf_dom_info));
-
-    return 0;
+    sedf_free_domdata(ops, d->sched_priv);
 }
 
-static void sedf_destroy_domain(struct domain *d)
-{
-    xfree(d->sched_priv);
-}
-
-static int sedf_pick_cpu(struct vcpu *v)
+static int sedf_pick_cpu(struct scheduler *ops, struct vcpu *v)
 {
     cpumask_t online_affinity;
+    cpumask_t *online;
 
-    cpus_and(online_affinity, v->cpu_affinity, cpu_online_map);
+    online = SEDF_CPUONLINE(v->domain->cpupool);
+    cpus_and(online_affinity, v->cpu_affinity, *online);
     return first_cpu(online_affinity);
 }
 
@@ -751,7 +790,7 @@ static struct task_slice sedf_do_extra_s
    -timeslice for the current period used up
    -domain on waitqueue has started it's period
    -and various others ;) in general: determine which domain to run next*/
-static struct task_slice sedf_do_schedule(s_time_t now)
+static struct task_slice sedf_do_schedule(struct scheduler *ops, s_time_t now)
 {
     int                   cpu      = smp_processor_id();
     struct list_head     *runq     = RUNQ(cpu);
@@ -786,6 +825,13 @@ static struct task_slice sedf_do_schedul
     }
  check_waitq:
     update_queues(now, runq, waitq);
+
+    if ( unlikely(!cpu_isset(cpu, *SEDF_CPUONLINE(per_cpu(cpupool, cpu)))) )
+    {
+        ret.task = IDLETASK(cpu);
+        ret.time = SECONDS(1);
+        goto sched_done;
+    }
  
     /*now simply pick the first domain from the runqueue, which has the
       earliest deadline, because the list is sorted*/
@@ -824,6 +870,7 @@ static struct task_slice sedf_do_schedul
                                      extraq, cpu);
     }
 
+  sched_done:
     /*TODO: Do something USEFUL when this happens and find out, why it
       still can happen!!!*/
     if ( ret.time < 0)
@@ -841,7 +888,7 @@ static struct task_slice sedf_do_schedul
 }
 
 
-static void sedf_sleep(struct vcpu *d)
+static void sedf_sleep(struct scheduler *ops, struct vcpu *d)
 {
     PRINT(2,"sedf_sleep was called, domain-id %i.%i\n",
           d->domain->domain_id, d->vcpu_id);
@@ -1060,7 +1107,7 @@ static inline int should_switch(struct v
     return 1;
 }
 
-static void sedf_wake(struct vcpu *d)
+static void sedf_wake(struct scheduler *ops, struct vcpu *d)
 {
     s_time_t              now = NOW();
     struct sedf_vcpu_info* inf = EDOM_INFO(d);
@@ -1213,8 +1260,8 @@ static void sedf_dump_domain(struct vcpu
 }
 
 
-/* dumps all domains on hte specified cpu */
-static void sedf_dump_cpu_state(int i)
+/* dumps all domains on the specified cpu */
+static void sedf_dump_cpu_state(struct scheduler *ops, int i)
 {
     struct list_head      *list, *queue, *tmp;
     struct sedf_vcpu_info *d_inf;
@@ -1287,7 +1334,7 @@ static void sedf_dump_cpu_state(int i)
 
 
 /* Adjusts periods and slices of the domains accordingly to their weights. */
-static int sedf_adjust_weights(struct xen_domctl_scheduler_op *cmd)
+static int sedf_adjust_weights(struct cpupool *c, struct xen_domctl_scheduler_op *cmd)
 {
     struct vcpu *p;
     struct domain      *d;
@@ -1308,6 +1355,8 @@ static int sedf_adjust_weights(struct xe
     rcu_read_lock(&domlist_read_lock);
     for_each_domain( d )
     {
+        if ( c != d->cpupool )
+            continue;
         for_each_vcpu( d, p )
         {
             if ( EDOM_INFO(p)->weight )
@@ -1359,7 +1408,7 @@ static int sedf_adjust_weights(struct xe
 
 
 /* set or fetch domain scheduling parameters */
-static int sedf_adjust(struct domain *p, struct xen_domctl_scheduler_op *op)
+static int sedf_adjust(struct scheduler *ops, struct domain *p, struct xen_domctl_scheduler_op *op)
 {
     struct vcpu *v;
     int rc;
@@ -1368,9 +1417,6 @@ static int sedf_adjust(struct domain *p,
           "new slice %"PRIu64"\nlatency %"PRIu64" extra:%s\n",
           p->domain_id, op->u.sedf.period, op->u.sedf.slice,
           op->u.sedf.latency, (op->u.sedf.extratime)?"yes":"no");
-
-    if ( !p->vcpu )
-        return -EINVAL;
 
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_putinfo )
     {
@@ -1421,7 +1467,7 @@ static int sedf_adjust(struct domain *p,
             }
         }
 
-        rc = sedf_adjust_weights(op);
+        rc = sedf_adjust_weights(p->cpupool, op);
         if ( rc )
             return rc;
 
@@ -1449,7 +1495,7 @@ static int sedf_adjust(struct domain *p,
     return 0;
 }
 
-const struct scheduler sched_sedf_def = {
+struct scheduler sched_sedf_def = {
     .name     = "Simple EDF Scheduler",
     .opt_name = "sedf",
     .sched_id = XEN_SCHEDULER_SEDF,
@@ -1457,8 +1503,14 @@ const struct scheduler sched_sedf_def = 
     .init_domain    = sedf_init_domain,
     .destroy_domain = sedf_destroy_domain,
 
-    .init_vcpu      = sedf_init_vcpu,
     .destroy_vcpu   = sedf_destroy_vcpu,
+
+    .alloc_vdata    = sedf_alloc_vdata,
+    .free_vdata     = sedf_free_vdata,
+    .alloc_pdata    = sedf_alloc_pdata,
+    .free_pdata     = sedf_free_pdata,
+    .alloc_domdata  = sedf_alloc_domdata,
+    .free_domdata   = sedf_free_domdata,
 
     .do_schedule    = sedf_do_schedule,
     .pick_cpu       = sedf_pick_cpu,
diff -r fadf63ab49e7 xen/common/schedule.c
--- a/xen/common/schedule.c     Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/common/schedule.c     Tue Apr 20 11:10:40 2010 +0200
@@ -53,11 +53,12 @@ static void poll_timer_fn(void *data);
 
 /* This is global for now so that private implementations can reach it */
 DEFINE_PER_CPU(struct schedule_data, schedule_data);
+DEFINE_PER_CPU(struct scheduler *, scheduler);
 
 extern const struct scheduler sched_sedf_def;
 extern const struct scheduler sched_credit_def;
 extern const struct scheduler sched_credit2_def;
-static const struct scheduler *__initdata schedulers[] = {
+static const struct scheduler *schedulers[] = {
     &sched_sedf_def,
     &sched_credit_def,
     &sched_credit2_def,
@@ -66,9 +67,15 @@ static const struct scheduler *__initdat
 
 static struct scheduler __read_mostly ops;
 
-#define SCHED_OP(fn, ...)                                 \
-         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
-          : (typeof(ops.fn(__VA_ARGS__)))0 )
+#define SCHED_OP(opsptr, fn, ...)                                          \
+         (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ )  \
+          : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
+
+#define DOM2OP(_d)    (((_d)->cpupool == NULL) ? &ops : &((_d)->cpupool->sched))
+#define VCPU2OP(_v)   (DOM2OP((_v)->domain))
+#define VCPU2ONLINE(_v)                                                    \
+         (((_v)->domain->cpupool == NULL) ? &cpu_online_map                \
+         : &(_v)->domain->cpupool->cpu_valid)
 
 static inline void trace_runstate_change(struct vcpu *v, int new_state)
 {
@@ -209,7 +216,86 @@ int sched_init_vcpu(struct vcpu *v, unsi
 
     TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);
 
-    return SCHED_OP(init_vcpu, v);
+    if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) )
+    {
+        per_cpu(schedule_data, v->processor).sched_priv =
+            SCHED_OP(DOM2OP(d), alloc_pdata, processor);
+        if ( per_cpu(schedule_data, v->processor).sched_priv == NULL )
+            return 1;
+    }
+
+    v->sched_priv = SCHED_OP(DOM2OP(d), alloc_vdata, v, d->sched_priv);
+    if ( v->sched_priv == NULL )
+        return 1;
+
+    if ( is_idle_domain(d) )
+        per_cpu(schedule_data, v->processor).sched_idlevpriv = v->sched_priv;
+
+    return 0;
+}
+
+int sched_move_domain(struct domain *d, struct cpupool *c)
+{
+    struct vcpu *v;
+    unsigned int new_p;
+    void **vcpu_priv;
+    void *domdata;
+
+    domdata = SCHED_OP(&(c->sched), alloc_domdata, d);
+    if ( domdata == NULL )
+        return -ENOMEM;
+
+    vcpu_priv = xmalloc_array(void *, d->max_vcpus);
+    if ( vcpu_priv == NULL )
+    {
+        SCHED_OP(&(c->sched), free_domdata, domdata);
+        return -ENOMEM;
+    }
+
+    memset(vcpu_priv, 0, d->max_vcpus * sizeof(void *));
+    for_each_vcpu ( d, v )
+    {
+        vcpu_priv[v->vcpu_id] = SCHED_OP(&(c->sched), alloc_vdata, v, domdata);
+        if ( vcpu_priv[v->vcpu_id] == NULL )
+        {
+            for_each_vcpu ( d, v )
+            {
+                if ( vcpu_priv[v->vcpu_id] != NULL )
+                    xfree(vcpu_priv[v->vcpu_id]);
+            }
+            xfree(vcpu_priv);
+            SCHED_OP(&(c->sched), free_domdata, domdata);
+            return -ENOMEM;
+        }
+    }
+
+    domain_pause(d);
+
+    new_p = first_cpu(c->cpu_valid);
+    for_each_vcpu ( d, v )
+    {
+        migrate_timer(&v->periodic_timer, new_p);
+        migrate_timer(&v->singleshot_timer, new_p);
+        migrate_timer(&v->poll_timer, new_p);
+
+        SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
+
+        cpus_setall(v->cpu_affinity);
+        v->processor = new_p;
+        v->sched_priv = vcpu_priv[v->vcpu_id];
+
+        new_p = cycle_cpu(new_p, c->cpu_valid);
+    }
+
+    d->cpupool = c;
+    SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv);
+    d->sched_priv = domdata;
+
+    domain_unpause(d);
+
+    xfree(vcpu_priv);
+
+    return 0;
 }
 
 void sched_destroy_vcpu(struct vcpu *v)
@@ -219,17 +305,17 @@ void sched_destroy_vcpu(struct vcpu *v)
     kill_timer(&v->poll_timer);
     if ( test_and_clear_bool(v->is_urgent) )
         atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
-    SCHED_OP(destroy_vcpu, v);
+    SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
 }
 
 int sched_init_domain(struct domain *d)
 {
-    return SCHED_OP(init_domain, d);
+    return SCHED_OP(DOM2OP(d), init_domain, d);
 }
 
 void sched_destroy_domain(struct domain *d)
 {
-    SCHED_OP(destroy_domain, d);
+    SCHED_OP(DOM2OP(d), destroy_domain, d);
 }
 
 void vcpu_sleep_nosync(struct vcpu *v)
@@ -243,7 +329,7 @@ void vcpu_sleep_nosync(struct vcpu *v)
         if ( v->runstate.state == RUNSTATE_runnable )
             vcpu_runstate_change(v, RUNSTATE_offline, NOW());
 
-        SCHED_OP(sleep, v);
+        SCHED_OP(VCPU2OP(v), sleep, v);
     }
 
     vcpu_schedule_unlock_irqrestore(v, flags);
@@ -271,7 +357,7 @@ void vcpu_wake(struct vcpu *v)
     {
         if ( v->runstate.state >= RUNSTATE_blocked )
             vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
-        SCHED_OP(wake, v);
+        SCHED_OP(VCPU2OP(v), wake, v);
     }
     else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
     {
@@ -326,7 +412,7 @@ static void vcpu_migrate(struct vcpu *v)
 
     /* Select new CPU. */
     old_cpu = v->processor;
-    new_cpu = SCHED_OP(pick_cpu, v);
+    new_cpu = SCHED_OP(VCPU2OP(v), pick_cpu, v);
 
     /*
      * Transfer urgency status to new CPU before switching CPUs, as once
@@ -369,19 +455,29 @@ void vcpu_force_reschedule(struct vcpu *
 }
 
 /*
- * This function is used by cpu_hotplug code from stop_machine context.
- * Hence we can avoid needing to take certain locks.
+ * This function is used by cpu_hotplug code from stop_machine context
+ * and from cpupools to switch schedulers on a cpu.
  */
-void cpu_disable_scheduler(void)
+int cpu_disable_scheduler(unsigned int cpu, int lock)
 {
     struct domain *d;
     struct vcpu *v;
-    unsigned int cpu = smp_processor_id();
+    struct cpupool *c;
+    int    ret = 0;
+
+    c = per_cpu(cpupool, cpu);
+    if ( c == NULL )
+        return ret;
 
     for_each_domain ( d )
     {
+        if ( d->cpupool != c )
+            continue;
+
         for_each_vcpu ( d, v )
         {
+            if ( lock != 0 )
+                vcpu_schedule_lock_irq(v);
             if ( (cpus_weight(v->cpu_affinity) == 1) &&
                  cpu_isset(cpu, v->cpu_affinity) )
             {
@@ -395,26 +491,46 @@ void cpu_disable_scheduler(void)
              * be chosen when the timer is next re-set.
              */
             if ( v->singleshot_timer.cpu == cpu )
-                migrate_timer(&v->singleshot_timer, 0);
+            {
+                int cpu_mig;
+
+                cpu_mig = first_cpu(c->cpu_valid);
+                if (cpu_mig == cpu)
+                    cpu_mig = next_cpu(cpu_mig, c->cpu_valid);
+                migrate_timer(&v->singleshot_timer, cpu_mig);
+            }
 
             if ( v->processor == cpu )
             {
                 set_bit(_VPF_migrating, &v->pause_flags);
+                if ( lock != 0 )
+                    vcpu_schedule_unlock_irq(v);
                 vcpu_sleep_nosync(v);
                 vcpu_migrate(v);
             }
+            else if ( lock != 0 )
+                vcpu_schedule_unlock_irq(v);
+            /*
+             * A vcpu active in the hypervisor will not be migratable.
+             * The caller should try again after releasing and reacquiring
+             * all locks.
+             */
+            if ( v->processor == cpu )
+                ret = -EAGAIN;
         }
     }
+    return ret;
 }
 
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
 {
     cpumask_t online_affinity, old_affinity;
+    cpumask_t *online;
 
     if ( v->domain->is_pinned )
         return -EINVAL;
-
-    cpus_and(online_affinity, *affinity, cpu_online_map);
+    online = VCPU2ONLINE(v);
+    cpus_and(online_affinity, *affinity, *online);
     if ( cpus_empty(online_affinity) )
         return -EINVAL;
 
@@ -723,7 +839,7 @@ long sched_adjust(struct domain *d, stru
     struct vcpu *v;
     long ret;
     
-    if ( (op->sched_id != ops.sched_id) ||
+    if ( (op->sched_id != DOM2OP(d)->sched_id) ||
          ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
           (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
         return -EINVAL;
@@ -750,7 +866,7 @@ long sched_adjust(struct domain *d, stru
     if ( d == current->domain )
         vcpu_schedule_lock_irq(current);
 
-    if ( (ret = SCHED_OP(adjust, d, op)) == 0 )
+    if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 )
         TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
 
     if ( d == current->domain )
@@ -797,6 +913,7 @@ static void schedule(void)
 {
     struct vcpu          *prev = current, *next = NULL;
     s_time_t              now = NOW();
+    struct scheduler     *sched = this_cpu(scheduler);
     struct schedule_data *sd;
     struct task_slice     next_slice;
 
@@ -812,7 +929,7 @@ static void schedule(void)
     stop_timer(&sd->s_timer);
     
     /* get policy-specific decision on scheduling... */
-    next_slice = ops.do_schedule(now);
+    next_slice = sched->do_schedule(sched, now);
 
     next = next_slice.task;
 
@@ -871,6 +988,10 @@ static void schedule(void)
     update_vcpu_system_time(next);
     vcpu_periodic_timer_work(next);
 
+    TRACE_4D(TRC_SCHED_SWITCH,
+             prev->domain->domain_id, prev->vcpu_id,
+             next->domain->domain_id, next->vcpu_id);
+
     context_switch(prev, next);
 }
 
@@ -884,7 +1005,7 @@ void context_saved(struct vcpu *prev)
     /* Check for migration request /after/ clearing running flag. */
     smp_mb();
 
-    SCHED_OP(context_saved, prev);
+    SCHED_OP(VCPU2OP(prev), context_saved, prev);
 
     if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
         vcpu_migrate(prev);
@@ -920,20 +1041,25 @@ static void poll_timer_fn(void *data)
         vcpu_unblock(v);
 }
 
+/* Get scheduler by id */
+const struct scheduler *scheduler_get_by_id(unsigned int id)
+{
+    int i;
+
+    for ( i = 0; schedulers[i] != NULL; i++ )
+    {
+        if ( schedulers[i]->sched_id == id )
+            return schedulers[i];
+    }
+    return NULL;
+}
+
 /* Initialise the data structures. */
 void __init scheduler_init(void)
 {
     int i;
 
     open_softirq(SCHEDULE_SOFTIRQ, schedule);
-
-    for_each_possible_cpu ( i )
-    {
-        spin_lock_init(&per_cpu(schedule_data, i)._lock);
-        per_cpu(schedule_data, i).schedule_lock
-            = &per_cpu(schedule_data, i)._lock;
-        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
-    }
 
     for ( i = 0; schedulers[i] != NULL; i++ )
     {
@@ -948,43 +1074,125 @@ void __init scheduler_init(void)
         ops = *schedulers[0];
     }
 
+    for_each_possible_cpu ( i )
+    {
+        per_cpu(scheduler, i) = &ops;
+        spin_lock_init(&per_cpu(schedule_data, i)._lock);
+        per_cpu(schedule_data, i).schedule_lock
+            = &per_cpu(schedule_data, i)._lock;
+        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
+    }
+
     printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
-    SCHED_OP(init);
+    if ( SCHED_OP(&ops, init, 1) )
+        panic("scheduler returned error on init\n");
 }
 
-void dump_runq(unsigned char key)
+/* switch scheduler on cpu */
+void schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
 {
-    s_time_t      now = NOW();
-    int           i;
     unsigned long flags;
+    struct vcpu *v;
+    void *vpriv = NULL;
+    void *ppriv;
+    void *ppriv_old;
+    struct scheduler *old_ops;
+    struct scheduler *new_ops;
 
-    local_irq_save(flags);
+    old_ops = per_cpu(scheduler, cpu);
+    new_ops = (c == NULL) ? &ops : &(c->sched);
+    v = per_cpu(schedule_data, cpu).idle;
+    ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
+    if ( c != NULL )
+        vpriv = SCHED_OP(new_ops, alloc_vdata, v, v->domain->sched_priv);
 
-    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
-    SCHED_OP(dump_settings);
-    printk("sched_smt_power_savings: %s\n",
-            sched_smt_power_savings? "enabled":"disabled");
-    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);
+    spin_lock_irqsave(per_cpu(schedule_data, cpu).schedule_lock, flags);
 
-    for_each_online_cpu ( i )
+    if ( c == NULL )
+    {
+        vpriv = v->sched_priv;
+        v->sched_priv = per_cpu(schedule_data, cpu).sched_idlevpriv;
+    }
+    else
+    {
+        v->sched_priv = vpriv;
+        vpriv = NULL;
+    }
+    SCHED_OP(old_ops, tick_suspend, cpu);
+    per_cpu(scheduler, cpu) = new_ops;
+    ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
+    per_cpu(schedule_data, cpu).sched_priv = ppriv;
+    SCHED_OP(new_ops, tick_resume, cpu);
+    SCHED_OP(new_ops, insert_vcpu, v);
+
+    spin_unlock_irqrestore(per_cpu(schedule_data, cpu).schedule_lock, flags);
+
+    if ( vpriv != NULL )
+        SCHED_OP(old_ops, free_vdata, vpriv);
+    SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
+}
+
+/* init scheduler global data */
+int schedule_init_global(char *name, struct scheduler *sched)
+{
+    int i;
+    const struct scheduler *data;
+
+    data = &ops;
+    for ( i = 0; (schedulers[i] != NULL) && (name != NULL) ; i++ )
+    {
+        if ( strcmp(schedulers[i]->opt_name, name) == 0 )
+        {
+            data = schedulers[i];
+            break;
+        }
+    }
+    memcpy(sched, data, sizeof(*sched));
+    return SCHED_OP(sched, init, 0);
+}
+
+/* deinitialize scheduler global data */
+void schedule_deinit_global(struct scheduler *sched)
+{
+    SCHED_OP(sched, deinit);
+}
+
+void schedule_dump(struct cpupool *c)
+{
+    int               i;
+    struct scheduler *sched;
+    cpumask_t        *cpus;
+
+    sched = (c == NULL) ? &ops : &(c->sched);
+    cpus = (c == NULL) ? &cpupool_free_cpus : &c->cpu_valid;
+    printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
+    SCHED_OP(sched, dump_settings);
+
+    for_each_cpu_mask (i, *cpus)
     {
         spin_lock(per_cpu(schedule_data, i).schedule_lock);
         printk("CPU[%02d] ", i);
-        SCHED_OP(dump_cpu_state, i);
+        SCHED_OP(sched, dump_cpu_state, i);
         spin_unlock(per_cpu(schedule_data, i).schedule_lock);
     }
-
-    local_irq_restore(flags);
 }
 
 void sched_tick_suspend(void)
 {
-    SCHED_OP(tick_suspend);
+    struct scheduler *sched;
+    unsigned int cpu = smp_processor_id();
+
+    sched = per_cpu(scheduler, cpu);
+    SCHED_OP(sched, tick_suspend, cpu);
 }
 
 void sched_tick_resume(void)
 {
-    SCHED_OP(tick_resume);
+    struct scheduler *sched;
+    unsigned int cpu = smp_processor_id();
+
+    sched = per_cpu(scheduler, cpu);
+    SCHED_OP(sched, tick_resume, cpu);
 }
 
 #ifdef CONFIG_COMPAT
diff -r fadf63ab49e7 xen/include/asm-x86/smp.h
--- a/xen/include/asm-x86/smp.h Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/include/asm-x86/smp.h Tue Apr 20 11:10:40 2010 +0200
@@ -56,7 +56,6 @@ extern u32 cpu_2_logical_apicid[];
 #define CPU_ONLINE     0x0002  /* CPU is up */
 #define CPU_DEAD       0x0004  /* CPU is dead */
 DECLARE_PER_CPU(int, cpu_state);
-extern spinlock_t(cpu_add_remove_lock);
 
 #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
 extern int cpu_down(unsigned int cpu);
diff -r fadf63ab49e7 xen/include/public/domctl.h
--- a/xen/include/public/domctl.h       Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/include/public/domctl.h       Tue Apr 20 11:10:40 2010 +0200
@@ -35,7 +35,7 @@
 #include "xen.h"
 #include "grant_table.h"
 
-#define XEN_DOMCTL_INTERFACE_VERSION 0x00000006
+#define XEN_DOMCTL_INTERFACE_VERSION 0x00000007
 
 struct xenctl_cpumap {
     XEN_GUEST_HANDLE_64(uint8) bitmap;
@@ -60,10 +60,14 @@ struct xen_domctl_createdomain {
  /* Should domain memory integrity be verifed by tboot during Sx? */
 #define _XEN_DOMCTL_CDF_s3_integrity  2
 #define XEN_DOMCTL_CDF_s3_integrity   (1U<<_XEN_DOMCTL_CDF_s3_integrity)
-    uint32_t flags;
  /* Disable out-of-sync shadow page tables? */
 #define _XEN_DOMCTL_CDF_oos_off       3
 #define XEN_DOMCTL_CDF_oos_off        (1U<<_XEN_DOMCTL_CDF_oos_off)
+ /* cpupool is specified (0 otherwise) */
+#define _XEN_DOMCTL_CDF_pool          4
+#define XEN_DOMCTL_CDF_pool           (1U<<_XEN_DOMCTL_CDF_pool)
+    uint32_t flags;
+    uint32_t cpupool;
 };
 typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t);
@@ -106,6 +110,7 @@ struct xen_domctl_getdomaininfo {
     uint32_t max_vcpu_id;        /* Maximum VCPUID in use by this domain. */
     uint32_t ssidref;
     xen_domain_handle_t handle;
+    uint32_t cpupool;
 };
 typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t);
@@ -785,6 +790,30 @@ typedef struct xen_domctl_mem_sharing_op
 typedef struct xen_domctl_mem_sharing_op xen_domctl_mem_sharing_op_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_sharing_op_t);
 
+/*
+ * cpupool operations
+ */
+/* XEN_DOMCTL_cpupool_op */
+#define XEN_DOMCTL_CPUPOOL_OP_CREATE                1  /* C */
+#define XEN_DOMCTL_CPUPOOL_OP_DESTROY               2  /* D */
+#define XEN_DOMCTL_CPUPOOL_OP_INFO                  3  /* I */
+#define XEN_DOMCTL_CPUPOOL_OP_ADDCPU                4  /* A */
+#define XEN_DOMCTL_CPUPOOL_OP_RMCPU                 5  /* R */
+#define XEN_DOMCTL_CPUPOOL_OP_MOVEDOMAIN            6  /* M */
+#define XEN_DOMCTL_CPUPOOL_OP_FREEINFO              7  /* F */
+#define XEN_DOMCTL_CPUPOOL_PAR_ANY     0xFFFFFFFF
+struct xen_domctl_cpupool_op {
+    uint32_t op;          /* IN */
+    uint32_t cpupool_id;  /* IN: CDIARM OUT: CI */
+    uint32_t sched_id;    /* IN: C      OUT: I  */
+    uint32_t domid;       /* IN: M              */
+    uint32_t cpu;         /* IN: AR             */
+    uint32_t n_dom;       /*            OUT: I  */
+    struct xenctl_cpumap cpumap; /*     OUT: IF */
+};
+typedef struct xen_domctl_cpupool_op xen_domctl_cpupool_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_cpupool_op_t);
+
 
 struct xen_domctl {
     uint32_t cmd;
@@ -846,6 +875,7 @@ struct xen_domctl {
 #define XEN_DOMCTL_gettscinfo                    59
 #define XEN_DOMCTL_settscinfo                    60
 #define XEN_DOMCTL_getpageframeinfo3             61
+#define XEN_DOMCTL_cpupool_op                    62
 #define XEN_DOMCTL_gdbsx_guestmemio            1000
 #define XEN_DOMCTL_gdbsx_pausevcpu             1001
 #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
@@ -894,6 +924,7 @@ struct xen_domctl {
         struct xen_domctl_debug_op          debug_op;
         struct xen_domctl_mem_event_op      mem_event_op;
         struct xen_domctl_mem_sharing_op    mem_sharing_op;
+        struct xen_domctl_cpupool_op        cpupool_op;
 #if defined(__i386__) || defined(__x86_64__)
         struct xen_domctl_cpuid             cpuid;
 #endif
diff -r fadf63ab49e7 xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h        Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/include/xen/sched-if.h        Tue Apr 20 11:10:40 2010 +0200
@@ -9,6 +9,15 @@
 #define __XEN_SCHED_IF_H__
 
 #include <xen/percpu.h>
+
+/* A global pointer to the initial cpupool (POOL0). */
+extern struct cpupool *cpupool0;
+
+/* cpus currently in no cpupool */
+extern cpumask_t cpupool_free_cpus;
+
+/* cpupool lock (used for cpu on/offline, too) */
+extern spinlock_t cpupool_lock;
 
 /*
  * In order to allow a scheduler to remap the lock->cpu mapping,
@@ -26,11 +35,14 @@ struct schedule_data {
     struct vcpu        *curr;           /* current task                    */
     struct vcpu        *idle;           /* idle task for this cpu          */
     void               *sched_priv;
+    void               *sched_idlevpriv; /* default scheduler vcpu data    */
     struct timer        s_timer;        /* scheduling timer                */
     atomic_t            urgent_count;   /* how many urgent vcpus           */
 } __cacheline_aligned;
 
 DECLARE_PER_CPU(struct schedule_data, schedule_data);
+DECLARE_PER_CPU(struct scheduler *, scheduler);
+DECLARE_PER_CPU(struct cpupool *, cpupool);
 
 static inline void vcpu_schedule_lock(struct vcpu *v)
 {
@@ -78,29 +90,50 @@ struct scheduler {
     char *name;             /* full name for this scheduler      */
     char *opt_name;         /* option name for this scheduler    */
     unsigned int sched_id;  /* ID for this scheduler             */
+    void *sched_data;       /* global data pointer               */
 
-    void         (*init)           (void);
+    int          (*init)           (struct scheduler *, int);
+    void         (*deinit)         (struct scheduler *);
 
-    int          (*init_domain)    (struct domain *);
-    void         (*destroy_domain) (struct domain *);
+    void         (*free_vdata)     (struct scheduler *, void *);
+    void *       (*alloc_vdata)    (struct scheduler *, struct vcpu *,
+                                    void *);
+    void         (*free_pdata)     (struct scheduler *, void *, int);
+    void *       (*alloc_pdata)    (struct scheduler *, int);
+    void         (*free_domdata)   (struct scheduler *, void *);
+    void *       (*alloc_domdata)  (struct scheduler *, struct domain *);
 
-    int          (*init_vcpu)      (struct vcpu *);
-    void         (*destroy_vcpu)   (struct vcpu *);
+    int          (*init_domain)    (struct scheduler *, struct domain *);
+    void         (*destroy_domain) (struct scheduler *, struct domain *);
 
-    void         (*sleep)          (struct vcpu *);
-    void         (*wake)           (struct vcpu *);
-    void         (*context_saved)  (struct vcpu *);
+    void         (*insert_vcpu)    (struct scheduler *, struct vcpu *);
+    void         (*destroy_vcpu)   (struct scheduler *, struct vcpu *);
 
-    struct task_slice (*do_schedule) (s_time_t);
+    void         (*sleep)          (struct scheduler *, struct vcpu *);
+    void         (*wake)           (struct scheduler *, struct vcpu *);
+    void         (*context_saved)  (struct scheduler *, struct vcpu *);
 
-    int          (*pick_cpu)       (struct vcpu *);
-    int          (*adjust)         (struct domain *,
+    struct task_slice (*do_schedule) (struct scheduler *, s_time_t);
+
+    int          (*pick_cpu)       (struct scheduler *, struct vcpu *);
+    int          (*adjust)         (struct scheduler *, struct domain *,
                                     struct xen_domctl_scheduler_op *);
-    void         (*dump_settings)  (void);
-    void         (*dump_cpu_state) (int);
+    void         (*dump_settings)  (struct scheduler *);
+    void         (*dump_cpu_state) (struct scheduler *, int);
 
-    void         (*tick_suspend)    (void);
-    void         (*tick_resume)     (void);
+    void         (*tick_suspend)    (struct scheduler *, unsigned int);
+    void         (*tick_resume)     (struct scheduler *, unsigned int);
 };
 
+struct cpupool
+{
+    int              cpupool_id;
+    cpumask_t        cpu_valid;      /* all cpus assigned to pool */
+    struct cpupool   *next;
+    unsigned int     n_dom;
+    struct scheduler sched;
+};
+
+const struct scheduler *scheduler_get_by_id(unsigned int id);
+
 #endif /* __XEN_SCHED_IF_H__ */
diff -r fadf63ab49e7 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Mon Apr 19 17:57:28 2010 +0100
+++ b/xen/include/xen/sched.h   Tue Apr 20 11:10:40 2010 +0200
@@ -213,6 +213,7 @@ struct domain
 
     /* Scheduling. */
     void            *sched_priv;    /* scheduler-specific data */
+    struct cpupool  *cpupool;
 
     struct domain   *next_in_list;
     struct domain   *next_in_hashbucket;
@@ -377,7 +378,7 @@ static inline void get_knownalive_domain
 }
 
 struct domain *domain_create(
-    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
+    domid_t domid, int poolid, unsigned int domcr_flags, ssidref_t ssidref);
  /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
 #define _DOMCRF_hvm           0
 #define DOMCRF_hvm            (1U<<_DOMCRF_hvm)
@@ -465,6 +466,7 @@ void sched_destroy_vcpu(struct vcpu *v);
 void sched_destroy_vcpu(struct vcpu *v);
 int  sched_init_domain(struct domain *d);
 void sched_destroy_domain(struct domain *d);
+int sched_move_domain(struct domain *d, struct cpupool *c);
 long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *);
 int  sched_id(void);
 void sched_tick_suspend(void);
@@ -575,8 +577,13 @@ void domain_unpause_by_systemcontroller(
 void domain_unpause_by_systemcontroller(struct domain *d);
 void cpu_init(void);
 
+struct scheduler;
+
+int schedule_init_global(char *name, struct scheduler *sched);
+void schedule_deinit_global(struct scheduler *sched);
+void schedule_cpu_switch(unsigned int cpu, struct cpupool *c);
 void vcpu_force_reschedule(struct vcpu *v);
-void cpu_disable_scheduler(void);
+int cpu_disable_scheduler(unsigned int cpu, int lock);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
@@ -607,6 +614,18 @@ extern enum cpufreq_controller {
     FREQCTL_none, FREQCTL_dom0_kernel, FREQCTL_xen
 } cpufreq_controller;
 
+#define CPUPOOLID_NONE    -1
+
+struct cpupool *cpupool_create(int poolid, char *sched);
+int cpupool_destroy(struct cpupool *c);
+int cpupool0_cpu_assign(struct cpupool *c);
+int cpupool_assign_ncpu(struct cpupool *c, int ncpu);
+void cpupool_cpu_add(unsigned int cpu);
+int cpupool_add_domain(struct domain *d, int poolid);
+void cpupool_rm_domain(struct domain *d);
+int cpupool_do_domctl(struct xen_domctl_cpupool_op *op);
+#define num_cpupool_cpus(c) (cpus_weight((c)->cpu_valid))
+
 #endif /* __SCHED_H__ */
 
 /*