To: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [Patch 1/6] xen: cpupool support - hypervisor support of cpupools
From: Juergen Gross <juergen.gross@xxxxxxxxxxxxxx>
Date: Fri, 17 Apr 2009 11:53:56 +0200
Signed-off-by: juergen.gross@xxxxxxxxxxxxxx

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 636 47950
Fujitsu Technology Solutions              e-mail: juergen.gross@xxxxxxxxxxxxxx
Otto-Hahn-Ring 6                        Internet: ts.fujitsu.com
D-81739 Muenchen                 Company details: ts.fujitsu.com/imprint.html
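
This patch adds the hypervisor side of cpupools: physical cpus are partitioned into pools, each pool runs its own scheduler instance, and every domain belongs to exactly one pool. domain_create() therefore gains a pool id argument (CPUPOOLID_NONE for the idle and system domains, pool 0 for dom0), and cpupool 0 is created during __start_xen(). The following stand-alone C model only illustrates these relationships; the names and types are not the Xen structures.

/* Minimal stand-alone model of the cpupool idea introduced by this patch.
 * Names and fields are illustrative only; they are not the Xen structures. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t cpumask_t;              /* one bit per physical cpu */

struct cpupool_model {
    int        pool_id;
    cpumask_t  cpu_valid;                /* cpus owned by this pool */
    const char *sched_name;              /* each pool runs its own scheduler */
};

struct domain_model {
    int                   domid;
    struct cpupool_model *pool;          /* every domain lives in one pool */
};

int main(void)
{
    struct cpupool_model pool0 = { 0, 0x0fULL, "credit" };   /* cpus 0-3 */
    struct cpupool_model pool1 = { 1, 0xf0ULL, "sedf"   };   /* cpus 4-7 */
    struct domain_model  dom0  = { 0, &pool0 };
    struct domain_model  domU  = { 1, &pool1 };

    /* A vcpu of a domain may only be placed on cpus in its pool's mask. */
    printf("dom%d may run on mask %#llx (%s)\n", dom0.domid,
           (unsigned long long)dom0.pool->cpu_valid, dom0.pool->sched_name);
    printf("dom%d may run on mask %#llx (%s)\n", domU.domid,
           (unsigned long long)domU.pool->cpu_valid, domU.pool->sched_name);
    return 0;
}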
diff -r 655dc3bc1d8e xen/arch/x86/acpi/cpu_idle.c
--- a/xen/arch/x86/acpi/cpu_idle.c      Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/arch/x86/acpi/cpu_idle.c      Thu Apr 16 15:04:13 2009 +0200
@@ -198,7 +198,7 @@ static void acpi_processor_idle(void)
 
     cpufreq_dbs_timer_suspend();
 
-    sched_tick_suspend();
+    sched_tick_suspend(smp_processor_id());
     /*
      * sched_tick_suspend may raise TIMER_SOFTIRQ by __stop_timer,
      * which will break the later assumption of no sofirq pending,
@@ -216,7 +216,7 @@ static void acpi_processor_idle(void)
     if ( softirq_pending(smp_processor_id()) )
     {
         local_irq_enable();
-        sched_tick_resume();
+        sched_tick_resume(smp_processor_id());
         cpufreq_dbs_timer_resume();
         return;
     }
@@ -237,7 +237,7 @@ static void acpi_processor_idle(void)
             pm_idle_save();
         else
             acpi_safe_halt();
-        sched_tick_resume();
+        sched_tick_resume(smp_processor_id());
         cpufreq_dbs_timer_resume();
         return;
     }
@@ -345,7 +345,7 @@ static void acpi_processor_idle(void)
 
     default:
         local_irq_enable();
-        sched_tick_resume();
+        sched_tick_resume(smp_processor_id());
         cpufreq_dbs_timer_resume();
         return;
     }
@@ -357,7 +357,7 @@ static void acpi_processor_idle(void)
         cx->time += sleep_ticks;
     }
 
-    sched_tick_resume();
+    sched_tick_resume(smp_processor_id());
     cpufreq_dbs_timer_resume();
 
     if ( cpuidle_current_governor->reflect )
diff -r 655dc3bc1d8e xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/arch/x86/domain.c     Thu Apr 09 11:58:17 2009 +0200
@@ -1412,7 +1412,13 @@ struct migrate_info {
     void (*saved_schedule_tail)(struct vcpu *);
     cpumask_t saved_affinity;
     unsigned int nest;
+    int borrowed;
 };
+
+long continue_hypercall_on_cpu_dummy(void *data)
+{
+    return 0;
+}
 
 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
 {
@@ -1420,8 +1426,16 @@ static void continue_hypercall_on_cpu_he
     struct migrate_info *info = v->arch.continue_info;
     cpumask_t mask = info->saved_affinity;
     void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
+    int cpu = -1;
 
     regs->eax = info->func(info->data);
+
+    if ( (info->nest == 0) && info->borrowed &&
+         ((cpu = cpupool_return_cpu(v->domain->cpupool)) >= 0) )
+    {
+        continue_hypercall_on_cpu(cpu, continue_hypercall_on_cpu_dummy,
+            info->data);
+    }
 
     if ( info->nest-- == 0 )
     {
@@ -1440,27 +1454,32 @@ int continue_hypercall_on_cpu(int cpu, l
     struct migrate_info *info;
     cpumask_t mask = cpumask_of_cpu(cpu);
     int rc;
+    int borrowed = 0;
 
     if ( cpu == smp_processor_id() )
         return func(data);
 
+    borrowed = cpupool_borrow_cpu(v->domain->cpupool, cpu);
+
     info = v->arch.continue_info;
     if ( info == NULL )
     {
         info = xmalloc(struct migrate_info);
+        rc = -ENOMEM;
         if ( info == NULL )
-            return -ENOMEM;
+            goto out;
 
         rc = vcpu_lock_affinity(v, &mask);
         if ( rc )
         {
             xfree(info);
-            return rc;
+            goto out;
         }
 
         info->saved_schedule_tail = v->arch.schedule_tail;
         info->saved_affinity = mask;
         info->nest = 0;
+        info->borrowed = 0;
 
         v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
         v->arch.continue_info = info;
@@ -1470,16 +1489,22 @@ int continue_hypercall_on_cpu(int cpu, l
         BUG_ON(info->nest != 0);
         rc = vcpu_locked_change_affinity(v, &mask);
         if ( rc )
-            return rc;
+            goto out;
         info->nest++;
     }
 
+    info->borrowed += borrowed;
     info->func = func;
     info->data = data;
 
     /* Dummy return value will be overwritten by new schedule_tail. */
     BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
     return 0;
+
+out:
+    if ( borrowed )
+        cpupool_return_cpu(v->domain->cpupool);
+    return rc;
 }
 
 #define next_arg(fmt, args) ({                                              \
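
continue_hypercall_on_cpu() may now be asked to run on a cpu outside the calling domain's pool, so the cpu is borrowed with cpupool_borrow_cpu() up front and handed back with cpupool_return_cpu() on the error path, or once the outermost nested call has completed (via the dummy continuation above). A stand-alone sketch of that borrow/return protocol follows; the helper bodies are purely illustrative and not the cpupool.c implementation.

/* Stand-alone sketch of the borrow/return protocol used by
 * continue_hypercall_on_cpu() above.  The helper names mirror the patch,
 * but the bodies are illustrative models, not the hypervisor code. */
#include <stdio.h>

static unsigned long pool_cpus    = 0x3;   /* cpus currently in the pool */
static unsigned long borrowed_cpu = 0;     /* cpu temporarily pulled in  */

static int cpupool_borrow_cpu_model(int cpu)
{
    if ( pool_cpus & (1UL << cpu) )
        return 0;                          /* already ours, nothing to do */
    borrowed_cpu = 1UL << cpu;             /* remember what to give back  */
    pool_cpus   |= borrowed_cpu;
    return 1;                              /* caller must return it later */
}

static void cpupool_return_cpu_model(void)
{
    pool_cpus   &= ~borrowed_cpu;
    borrowed_cpu = 0;
}

int main(void)
{
    int borrowed = cpupool_borrow_cpu_model(5); /* cpu 5 is outside the pool */
    printf("pool mask with borrowed cpu: %#lx\n", pool_cpus);
    /* ... run the continued hypercall on cpu 5 ... */
    if ( borrowed )
        cpupool_return_cpu_model();             /* error path or completion */
    printf("pool mask after return:      %#lx\n", pool_cpus);
    return 0;
}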
diff -r 655dc3bc1d8e xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/arch/x86/domain_build.c       Thu Apr 09 11:58:46 2009 +0200
@@ -9,6 +9,7 @@
 #include <xen/lib.h>
 #include <xen/ctype.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/smp.h>
 #include <xen/delay.h>
 #include <xen/event.h>
@@ -706,13 +707,13 @@ int __init construct_dom0(
         shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
 
     if ( opt_dom0_max_vcpus == 0 )
-        opt_dom0_max_vcpus = num_online_cpus();
+        opt_dom0_max_vcpus = num_cpupool_cpus(cpupool0);
     if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
         opt_dom0_max_vcpus = MAX_VIRT_CPUS;
     printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
 
     for ( i = 1; i < opt_dom0_max_vcpus; i++ )
-        (void)alloc_vcpu(d, i, i % num_online_cpus());
+        (void)alloc_vcpu(d, i, i % num_cpupool_cpus(cpupool0));
 
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(d) )
diff -r 655dc3bc1d8e xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/arch/x86/mm.c Thu Apr 09 12:00:02 2009 +0200
@@ -212,7 +212,7 @@ void __init arch_init_memory(void)
      * Any Xen-heap pages that we will allow to be mapped will have
      * their domain field set to dom_xen.
      */
-    dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
+    dom_xen = domain_create(DOMID_XEN, CPUPOOLID_NONE, DOMCRF_dummy, 0);
     BUG_ON(dom_xen == NULL);
 
     /*
@@ -220,7 +220,7 @@ void __init arch_init_memory(void)
      * This domain owns I/O pages that are within the range of the page_info
      * array. Mappings occur at the priv of the caller.
      */
-    dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
+    dom_io = domain_create(DOMID_IO, CPUPOOLID_NONE, DOMCRF_dummy, 0);
     BUG_ON(dom_io == NULL);
 
     /* First 1MB of RAM is historically marked as I/O. */
diff -r 655dc3bc1d8e xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/arch/x86/setup.c      Thu Apr 16 08:20:11 2009 +0200
@@ -2,6 +2,7 @@
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/domain.h>
 #include <xen/serial.h>
 #include <xen/softirq.h>
@@ -232,7 +233,7 @@ static void __init init_idle_domain(void
     /* Domain creation requires that scheduler structures are initialised. */
     scheduler_init();
 
-    idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
+    idle_domain = domain_create(IDLE_DOMAIN_ID, CPUPOOLID_NONE, 0, 0);
     if ( (idle_domain == NULL) || (alloc_vcpu(idle_domain, 0, 0) == NULL) )
         BUG();
 
@@ -995,8 +996,12 @@ void __init __start_xen(unsigned long mb
     if ( !tboot_protect_mem_regions() )
         panic("Could not protect TXT memory regions\n");
 
+    /* Create initial cpupool 0. */
+    cpupool0 = cpupool_create(0, NULL);
+    if ( (cpupool0 == NULL) || cpupool0_cpu_assign(cpupool0) )
+        panic("Error creating cpupool 0\n");
     /* Create initial domain 0. */
-    dom0 = domain_create(0, DOMCRF_s3_integrity, DOM0_SSIDREF);
+    dom0 = domain_create(0, 0, DOMCRF_s3_integrity, DOM0_SSIDREF);
     if ( (dom0 == NULL) || (alloc_vcpu(dom0, 0, 0) == NULL) )
         panic("Error creating domain 0\n");
 
diff -r 655dc3bc1d8e xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/arch/x86/smpboot.c    Thu Apr 09 12:04:14 2009 +0200
@@ -1265,7 +1265,7 @@ int __cpu_disable(void)
        /* It's now safe to remove this processor from the online map */
        cpu_clear(cpu, cpu_online_map);
 
-       cpu_disable_scheduler();
+       cpu_disable_scheduler(cpu, 0);
 
        return 0;
 }
@@ -1299,7 +1299,7 @@ int cpu_down(unsigned int cpu)
        int err = 0;
 
        spin_lock(&cpu_add_remove_lock);
-       if (num_online_cpus() == 1) {
+       if (cpupool_cpu_remove(cpu)) {
                err = -EBUSY;
                goto out;
        }
@@ -1451,6 +1451,7 @@ int __devinit __cpu_up(unsigned int cpu)
                process_pending_timers();
        }
 
+       cpupool_cpu_add(cpu);
        cpufreq_add_cpu(cpu);
        return 0;
 }
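
CPU hotplug is now routed through the pool bookkeeping: cpu_down() refuses to offline a cpu when cpupool_cpu_remove() reports it cannot be taken away (the exact policy lives in cpupool.c, outside this hunk), and a freshly onlined cpu is handed over with cpupool_cpu_add(). The following rough stand-alone model of those two hooks is illustrative only.

/* Rough model of the hotplug hooks: a cpu can only be taken offline once it
 * no longer belongs to a pool.  Purely illustrative, not the Xen logic. */
#include <stdio.h>

static unsigned long free_cpus  = 0x0;  /* cpus not assigned to any pool */
static unsigned long pool0_cpus = 0xf;  /* cpus owned by pool 0          */

static int cpupool_cpu_remove_model(int cpu)
{
    if ( pool0_cpus & (1UL << cpu) )
        return -1;                      /* still in a pool: refuse (-EBUSY) */
    free_cpus &= ~(1UL << cpu);
    return 0;
}

static void cpupool_cpu_add_model(int cpu)
{
    free_cpus |= 1UL << cpu;            /* newly onlined cpu starts out free */
}

int main(void)
{
    if ( cpupool_cpu_remove_model(2) )
        printf("cpu 2 busy: remove it from its pool first\n");
    cpupool_cpu_add_model(8);
    printf("free cpus: %#lx\n", free_cpus);
    return 0;
}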
diff -r 655dc3bc1d8e xen/common/Makefile
--- a/xen/common/Makefile       Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/common/Makefile       Thu Apr 09 12:04:41 2009 +0200
@@ -1,4 +1,5 @@ obj-y += bitmap.o
 obj-y += bitmap.o
+obj-y += cpupool.o
 obj-y += domctl.o
 obj-y += domain.o
 obj-y += event_channel.o
diff -r 655dc3bc1d8e xen/common/domain.c
--- a/xen/common/domain.c       Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/common/domain.c       Thu Apr 09 13:45:33 2009 +0200
@@ -187,7 +187,7 @@ struct vcpu *alloc_idle_vcpu(unsigned in
         return v;
 
     d = (vcpu_id == 0) ?
-        domain_create(IDLE_DOMAIN_ID, 0, 0) :
+        domain_create(IDLE_DOMAIN_ID, CPUPOOLID_NONE, 0, 0) :
         idle_vcpu[cpu_id - vcpu_id]->domain;
     BUG_ON(d == NULL);
 
@@ -198,7 +198,7 @@ struct vcpu *alloc_idle_vcpu(unsigned in
 }
 
 struct domain *domain_create(
-    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref)
+    domid_t domid, int poolid, unsigned int domcr_flags, ssidref_t ssidref)
 {
     struct domain *d, **pd;
     enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2,
@@ -259,6 +259,9 @@ struct domain *domain_create(
     d->iomem_caps = rangeset_new(d, "I/O Memory", RANGESETF_prettyprint_hex);
     d->irq_caps   = rangeset_new(d, "Interrupts", 0);
     if ( (d->iomem_caps == NULL) || (d->irq_caps == NULL) )
+        goto fail;
+
+    if ( cpupool_add_domain(d, poolid) != 0 )
         goto fail;
 
     if ( sched_init_domain(d) != 0 )
@@ -564,6 +567,8 @@ static void complete_domain_destroy(stru
 
     sched_destroy_domain(d);
 
+    cpupool_rm_domain(d);
+
     /* Free page used by xen oprofile buffer. */
     free_xenoprof_pages(d);
 
diff -r 655dc3bc1d8e xen/common/domctl.c
--- a/xen/common/domctl.c       Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/common/domctl.c       Thu Apr 16 08:20:11 2009 +0200
@@ -11,6 +11,7 @@
 #include <xen/lib.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/domain.h>
 #include <xen/event.h>
 #include <xen/domain_page.h>
@@ -138,15 +139,18 @@ void getdomaininfo(struct domain *d, str
     info->max_pages         = d->max_pages;
     info->shared_info_frame = mfn_to_gmfn(d, __pa(d->shared_info)>>PAGE_SHIFT);
 
+    info->cpupool = d->cpupool ? d->cpupool->cpupool_id : CPUPOOLID_NONE;
+
     memcpy(info->handle, d->handle, sizeof(xen_domain_handle_t));
 }
 
-static unsigned int default_vcpu0_location(void)
+static unsigned int default_vcpu0_location(struct domain *dom)
 {
     struct domain *d;
     struct vcpu   *v;
     unsigned int   i, cpu, nr_cpus, *cnt;
     cpumask_t      cpu_exclude_map;
+    cpumask_t      online;
 
     /* Do an initial CPU placement. Pick the least-populated CPU. */
     nr_cpus = last_cpu(cpu_possible_map) + 1;
@@ -171,7 +175,8 @@ static unsigned int default_vcpu0_locati
     if ( cpus_weight(cpu_sibling_map[0]) > 1 )
         cpu = next_cpu(cpu, cpu_sibling_map[0]);
     cpu_exclude_map = cpu_sibling_map[0];
-    for_each_online_cpu ( i )
+    online = (dom->cpupool == NULL) ? cpu_online_map : dom->cpupool->cpu_valid;
+    for_each_cpu_mask(i, online)
     {
         if ( cpu_isset(i, cpu_exclude_map) )
             continue;
@@ -366,12 +371,13 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
         domid_t        dom;
         static domid_t rover = 0;
         unsigned int domcr_flags;
+        int            pool = 0;
 
         ret = -EINVAL;
         if ( supervisor_mode_kernel ||
              (op->u.createdomain.flags &
              ~(XEN_DOMCTL_CDF_hvm_guest | XEN_DOMCTL_CDF_hap |
-               XEN_DOMCTL_CDF_s3_integrity)) )
+               XEN_DOMCTL_CDF_s3_integrity | XEN_DOMCTL_CDF_pool)) )
             break;
 
         dom = op->domain;
@@ -405,9 +411,11 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
             domcr_flags |= DOMCRF_hap;
         if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_s3_integrity )
             domcr_flags |= DOMCRF_s3_integrity;
+        if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_pool )
+            pool = op->u.createdomain.cpupool;
 
         ret = -ENOMEM;
-        d = domain_create(dom, domcr_flags, op->u.createdomain.ssidref);
+        d = domain_create(dom, pool, domcr_flags, op->u.createdomain.ssidref);
         if ( d == NULL )
             break;
 
@@ -426,6 +434,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
     {
         struct domain *d;
         unsigned int i, max = op->u.max_vcpus.max, cpu;
+        cpumask_t online;
 
         ret = -ESRCH;
         if ( (d = rcu_lock_domain_by_id(op->domain)) == NULL )
@@ -455,14 +464,15 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
             goto maxvcpu_out;
 
         ret = -ENOMEM;
+        online = (d->cpupool == NULL) ? cpu_online_map : d->cpupool->cpu_valid;
         for ( i = 0; i < max; i++ )
         {
             if ( d->vcpu[i] != NULL )
                 continue;
 
             cpu = (i == 0) ?
-                default_vcpu0_location() :
-                cycle_cpu(d->vcpu[i-1]->processor, cpu_online_map);
+                default_vcpu0_location(d) :
+                cycle_cpu(d->vcpu[i-1]->processor, online);
 
             if ( alloc_vcpu(d, i, cpu) == NULL )
                 goto maxvcpu_out;
@@ -890,6 +900,14 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
     }
     break;
 
+    case XEN_DOMCTL_cpupool_op:
+    {
+        ret = cpupool_do_domctl(op);
+        if ( (ret == 0) && copy_to_guest(u_domctl, op, 1) )
+            ret = -EFAULT;
+    }
+    break;
+
     default:
         ret = arch_do_domctl(op, u_domctl);
         break;
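
Domain creation can now carry a target pool (XEN_DOMCTL_CDF_pool plus createdomain.cpupool), and vcpu placement in default_vcpu0_location() and the max_vcpus path cycles over the pool's cpu_valid mask instead of cpu_online_map. A small stand-alone model of cycling placement over a pool mask; this is an illustration, not the Xen cycle_cpu() implementation.

/* Sketch of vcpu placement restricted to a pool's mask, mirroring the
 * cycle_cpu(..., online) change above.  Illustrative model only. */
#include <stdio.h>

#define NR_CPUS 8

/* Return the next set bit after 'prev', wrapping around the mask. */
static int cycle_cpu_model(int prev, unsigned long mask)
{
    for ( int i = 1; i <= NR_CPUS; i++ )
    {
        int cpu = (prev + i) % NR_CPUS;
        if ( mask & (1UL << cpu) )
            return cpu;
    }
    return -1;
}

int main(void)
{
    unsigned long pool_mask = 0x3c;     /* pool owns cpus 2-5 */
    int cpu = 2;

    for ( int vcpu = 1; vcpu < 6; vcpu++ )
    {
        cpu = cycle_cpu_model(cpu, pool_mask);
        printf("vcpu %d placed on cpu %d\n", vcpu, cpu);
    }
    return 0;
}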
diff -r 655dc3bc1d8e xen/common/sched_credit.c
--- a/xen/common/sched_credit.c Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/common/sched_credit.c Thu Apr 16 09:41:15 2009 +0200
@@ -69,11 +69,15 @@
 /*
  * Useful macros
  */
+#define CSCHED_PRIV(_ops)   \
+    ((struct csched_private *)((_ops)->sched_data))
 #define CSCHED_PCPU(_c)     \
     ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
 #define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
 #define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
 #define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
+#define CSCHED_CPUONLINE(_pool)    \
+    (((_pool) == NULL) ? cpupool_free_cpus : (_pool)->cpu_valid)
 
 
 /*
@@ -157,10 +161,12 @@ struct csched_private {
     struct timer  master_ticker;
     unsigned int master;
     cpumask_t idlers;
+    cpumask_t cpus;
     uint32_t weight;
     uint32_t credit;
     int credit_balance;
     uint32_t runq_sort;
+    int ticker_active;
 };
 
 
@@ -168,8 +174,10 @@ struct csched_private {
  * Global variables
  */
 static struct csched_private csched_priv;
+static struct csched_private *csched_priv0 = NULL;
 
 static void csched_tick(void *_cpu);
+static void csched_acct(void *dummy);
 
 static inline int
 __vcpu_on_runq(struct csched_vcpu *svc)
@@ -214,6 +222,7 @@ __runq_tickle(unsigned int cpu, struct c
 {
     struct csched_vcpu * const cur =
         CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
     cpumask_t mask;
 
     ASSERT(cur);
@@ -240,14 +249,14 @@ __runq_tickle(unsigned int cpu, struct c
      */
     if ( cur->pri > CSCHED_PRI_IDLE )
     {
-        if ( cpus_empty(csched_priv.idlers) )
+        if ( cpus_empty(prv->idlers) )
         {
             CSCHED_STAT_CRANK(tickle_idlers_none);
         }
         else
         {
             CSCHED_STAT_CRANK(tickle_idlers_some);
-            cpus_or(mask, mask, csched_priv.idlers);
+            cpus_or(mask, mask, prv->idlers);
             cpus_and(mask, mask, new->vcpu->cpu_affinity);
         }
     }
@@ -257,38 +266,78 @@ __runq_tickle(unsigned int cpu, struct c
         cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
 }
 
-static int
-csched_pcpu_init(int cpu)
+static void
+csched_free_pdata(struct scheduler *ops, void *pcpu, int cpu)
+{
+    struct csched_private *prv = CSCHED_PRIV(ops);
+    struct csched_pcpu *spc = pcpu;
+    unsigned long flags;
+
+    if ( spc == NULL )
+        return;
+
+    spin_lock_irqsave(&prv->lock, flags);
+
+    prv->credit -= CSCHED_CREDITS_PER_ACCT;
+    prv->ncpus--;
+    cpu_clear(cpu, prv->idlers);
+    cpu_clear(cpu, prv->cpus);
+    if ( (prv->master == cpu) && (prv->ncpus > 0) )
+    {
+        prv->master = first_cpu(prv->cpus);
+        migrate_timer(&prv->master_ticker, prv->master);
+    }
+    kill_timer(&spc->ticker);
+    if ( prv->ncpus == 0 )
+        kill_timer(&prv->master_ticker);
+
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+    xfree(spc);
+}
+
+static void *
+csched_alloc_pdata(struct scheduler *ops, int cpu)
 {
     struct csched_pcpu *spc;
+    struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
     /* Allocate per-PCPU info */
     spc = xmalloc(struct csched_pcpu);
     if ( spc == NULL )
-        return -1;
-
-    spin_lock_irqsave(&csched_priv.lock, flags);
+        return NULL;
+
+    spin_lock_irqsave(&prv->lock, flags);
 
     /* Initialize/update system-wide config */
-    csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
-    if ( csched_priv.ncpus <= cpu )
-        csched_priv.ncpus = cpu + 1;
-    if ( csched_priv.master >= csched_priv.ncpus )
-        csched_priv.master = cpu;
+    prv->credit += CSCHED_CREDITS_PER_ACCT;
+    prv->ncpus++;
+    cpu_set(cpu, prv->cpus);
+    if ( (prv->ncpus == 1) && (prv != csched_priv0) )
+    {
+        prv->master = cpu;
+        init_timer( &prv->master_ticker, csched_acct, prv, cpu);
+        prv->ticker_active = 2;
+    }
 
     init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
+
+    if ( prv == csched_priv0 )
+        prv->master = first_cpu(prv->cpus);
+
     INIT_LIST_HEAD(&spc->runq);
-    spc->runq_sort_last = csched_priv.runq_sort;
-    per_cpu(schedule_data, cpu).sched_priv = spc;
+    spc->runq_sort_last = prv->runq_sort;
+    if ( per_cpu(schedule_data, cpu).sched_priv == NULL )
+        per_cpu(schedule_data, cpu).sched_priv = spc;
 
     /* Start off idling... */
     BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
-    cpu_set(cpu, csched_priv.idlers);
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
-
-    return 0;
+    cpu_set(cpu, prv->idlers);
+
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+    return spc;
 }
 
 #ifndef NDEBUG
@@ -361,17 +410,19 @@ __csched_vcpu_is_migrateable(struct vcpu
 }
 
 static int
-csched_cpu_pick(struct vcpu *vc)
+csched_cpu_pick(struct scheduler *ops, struct vcpu *vc)
 {
     cpumask_t cpus;
     cpumask_t idlers;
+    cpumask_t online;
     int cpu;
 
     /*
      * Pick from online CPUs in VCPU's affinity mask, giving a
      * preference to its current processor if it's in there.
      */
-    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+    online = CSCHED_CPUONLINE(vc->domain->cpupool);
+    cpus_and(cpus, online, vc->cpu_affinity);
     cpu = cpu_isset(vc->processor, cpus)
             ? vc->processor
             : cycle_cpu(vc->processor, cpus);
@@ -389,7 +440,7 @@ csched_cpu_pick(struct vcpu *vc)
      * like run two VCPUs on co-hyperthreads while there are idle cores
      * or sockets.
      */
-    idlers = csched_priv.idlers;
+    idlers = CSCHED_PRIV(ops)->idlers;
     cpu_set(cpu, idlers);
     cpus_and(cpus, cpus, idlers);
     cpu_clear(cpu, cpus);
@@ -433,12 +484,12 @@ csched_cpu_pick(struct vcpu *vc)
 }
 
 static inline void
-__csched_vcpu_acct_start(struct csched_vcpu *svc)
+__csched_vcpu_acct_start(struct csched_private *prv, struct csched_vcpu *svc)
 {
     struct csched_dom * const sdom = svc->sdom;
     unsigned long flags;
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
+    spin_lock_irqsave(&(prv->lock), flags);
 
     if ( list_empty(&svc->active_vcpu_elem) )
     {
@@ -449,16 +500,17 @@ __csched_vcpu_acct_start(struct csched_v
         list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
         if ( list_empty(&sdom->active_sdom_elem) )
         {
-            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
-            csched_priv.weight += sdom->weight;
-        }
-    }
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
+            list_add(&sdom->active_sdom_elem, &(prv->active_sdom));
+            prv->weight += sdom->weight;
+        }
+    }
+
+    spin_unlock_irqrestore(&(prv->lock), flags);
 }
 
 static inline void
-__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
+__csched_vcpu_acct_stop_locked(struct csched_private *prv,
+    struct csched_vcpu *svc)
 {
     struct csched_dom * const sdom = svc->sdom;
 
@@ -471,16 +523,17 @@ __csched_vcpu_acct_stop_locked(struct cs
     list_del_init(&svc->active_vcpu_elem);
     if ( list_empty(&sdom->active_vcpu) )
     {
-        BUG_ON( csched_priv.weight < sdom->weight );
+        BUG_ON( prv->weight < sdom->weight );
         list_del_init(&sdom->active_sdom_elem);
-        csched_priv.weight -= sdom->weight;
+        prv->weight -= sdom->weight;
     }
 }
 
 static void
-csched_vcpu_acct(unsigned int cpu)
+csched_vcpu_acct(struct csched_private *prv, unsigned int cpu)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(current);
+    struct scheduler *ops = per_cpu(scheduler, cpu);
 
     ASSERT( current->processor == cpu );
     ASSERT( svc->sdom != NULL );
@@ -508,9 +561,9 @@ csched_vcpu_acct(unsigned int cpu)
      */
     if ( list_empty(&svc->active_vcpu_elem) )
     {
-        __csched_vcpu_acct_start(svc);
-    }
-    else if ( csched_cpu_pick(current) != cpu )
+        __csched_vcpu_acct_start(prv, svc);
+    }
+    else if ( csched_cpu_pick(ops, current) != cpu )
     {
         CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
         CSCHED_STAT_CRANK(migrate_running);
@@ -519,34 +572,54 @@ csched_vcpu_acct(unsigned int cpu)
     }
 }
 
-static int
-csched_vcpu_init(struct vcpu *vc)
-{
-    struct domain * const dom = vc->domain;
-    struct csched_dom *sdom = CSCHED_DOM(dom);
+static void *
+csched_alloc_vdata(struct scheduler *ops, struct vcpu *vc)
+{
     struct csched_vcpu *svc;
-
-    CSCHED_STAT_CRANK(vcpu_init);
 
     /* Allocate per-VCPU info */
     svc = xmalloc(struct csched_vcpu);
     if ( svc == NULL )
-        return -1;
+        return NULL;
 
     INIT_LIST_HEAD(&svc->runq_elem);
     INIT_LIST_HEAD(&svc->active_vcpu_elem);
-    svc->sdom = sdom;
+    svc->sdom = CSCHED_DOM(vc->domain);
     svc->vcpu = vc;
     atomic_set(&svc->credit, 0);
     svc->flags = 0U;
-    svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
+    svc->pri = is_idle_domain(vc->domain) ?
+        CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
     CSCHED_VCPU_STATS_RESET(svc);
+    return svc;
+}
+
+static void
+csched_vcpu_insert(struct scheduler *ops, struct vcpu *vc)
+{
+    struct csched_vcpu *svc = vc->sched_priv;
+
+    if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running )
+        __runq_insert(vc->processor, svc);
+}
+
+static int
+csched_vcpu_init(struct scheduler *ops, struct vcpu *vc)
+{
+    struct csched_vcpu *svc;
+
+    CSCHED_STAT_CRANK(vcpu_init);
+
+    svc = csched_alloc_vdata(ops, vc);
+    if ( svc == NULL )
+        return -1;
+
     vc->sched_priv = svc;
 
     /* Allocate per-PCPU info */
     if ( unlikely(!CSCHED_PCPU(vc->processor)) )
     {
-        if ( csched_pcpu_init(vc->processor) != 0 )
+        if ( csched_alloc_pdata(ops, vc->processor) == NULL )
             return -1;
     }
 
@@ -555,29 +628,41 @@ csched_vcpu_init(struct vcpu *vc)
 }
 
 static void
-csched_vcpu_destroy(struct vcpu *vc)
+csched_free_vdata(struct scheduler *ops, void *priv)
+{
+    struct csched_private *prv = CSCHED_PRIV(ops);
+    struct csched_vcpu *svc = priv;
+    unsigned long flags;
+
+    if ( __vcpu_on_runq(svc) )
+        __runq_remove(svc);
+
+    spin_lock_irqsave(&(prv->lock), flags);
+
+    if ( !list_empty(&svc->active_vcpu_elem) )
+        __csched_vcpu_acct_stop_locked(prv, svc);
+
+    spin_unlock_irqrestore(&(prv->lock), flags);
+
+    xfree(svc);
+}
+
+static void
+csched_vcpu_destroy(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
     struct csched_dom * const sdom = svc->sdom;
-    unsigned long flags;
 
     CSCHED_STAT_CRANK(vcpu_destroy);
 
     BUG_ON( sdom == NULL );
     BUG_ON( !list_empty(&svc->runq_elem) );
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
-
-    if ( !list_empty(&svc->active_vcpu_elem) )
-        __csched_vcpu_acct_stop_locked(svc);
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
-
-    xfree(svc);
+    csched_free_vdata(ops, svc);
 }
 
 static void
-csched_vcpu_sleep(struct vcpu *vc)
+csched_vcpu_sleep(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
 
@@ -592,7 +677,7 @@ csched_vcpu_sleep(struct vcpu *vc)
 }
 
 static void
-csched_vcpu_wake(struct vcpu *vc)
+csched_vcpu_wake(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
     const unsigned int cpu = vc->processor;
@@ -648,10 +733,11 @@ csched_vcpu_wake(struct vcpu *vc)
 
 static int
 csched_dom_cntl(
-    struct domain *d,
+    struct scheduler *ops, struct domain *d,
     struct xen_domctl_scheduler_op *op)
 {
     struct csched_dom * const sdom = CSCHED_DOM(d);
+    struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
@@ -663,14 +749,14 @@ csched_dom_cntl(
     {
         ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
 
-        spin_lock_irqsave(&csched_priv.lock, flags);
+        spin_lock_irqsave(&(prv->lock), flags);
 
         if ( op->u.credit.weight != 0 )
         {
             if ( !list_empty(&sdom->active_sdom_elem) )
             {
-                csched_priv.weight -= sdom->weight;
-                csched_priv.weight += op->u.credit.weight;
+                prv->weight -= sdom->weight;
+                prv->weight += op->u.credit.weight;
             }
             sdom->weight = op->u.credit.weight;
         }
@@ -678,14 +764,14 @@ csched_dom_cntl(
         if ( op->u.credit.cap != (uint16_t)~0U )
             sdom->cap = op->u.credit.cap;
 
-        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        spin_unlock_irqrestore(&(prv->lock), flags);
     }
 
     return 0;
 }
 
 static int
-csched_dom_init(struct domain *dom)
+csched_dom_init(struct scheduler *ops, struct domain *dom)
 {
     struct csched_dom *sdom;
 
@@ -711,7 +797,7 @@ csched_dom_init(struct domain *dom)
 }
 
 static void
-csched_dom_destroy(struct domain *dom)
+csched_dom_destroy(struct scheduler *ops, struct domain *dom)
 {
     CSCHED_STAT_CRANK(dom_destroy);
     xfree(CSCHED_DOM(dom));
@@ -725,7 +811,7 @@ csched_dom_destroy(struct domain *dom)
  * remember the last UNDER to make the move up operation O(1).
  */
 static void
-csched_runq_sort(unsigned int cpu)
+csched_runq_sort(struct csched_private *prv, unsigned int cpu)
 {
     struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
     struct list_head *runq, *elem, *next, *last_under;
@@ -733,7 +819,7 @@ csched_runq_sort(unsigned int cpu)
     unsigned long flags;
     int sort_epoch;
 
-    sort_epoch = csched_priv.runq_sort;
+    sort_epoch = prv->runq_sort;
     if ( sort_epoch == spc->runq_sort_last )
         return;
 
@@ -768,8 +854,9 @@ csched_runq_sort(unsigned int cpu)
 }
 
 static void
-csched_acct(void* dummy)
-{
+csched_acct(void *dummy)
+{
+    struct csched_private *prv = dummy;
     unsigned long flags;
     struct list_head *iter_vcpu, *next_vcpu;
     struct list_head *iter_sdom, *next_sdom;
@@ -786,22 +873,22 @@ csched_acct(void* dummy)
     int credit;
 
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
-
-    weight_total = csched_priv.weight;
-    credit_total = csched_priv.credit;
+    spin_lock_irqsave(&(prv->lock), flags);
+
+    weight_total = prv->weight;
+    credit_total = prv->credit;
 
     /* Converge balance towards 0 when it drops negative */
-    if ( csched_priv.credit_balance < 0 )
-    {
-        credit_total -= csched_priv.credit_balance;
+    if ( prv->credit_balance < 0 )
+    {
+        credit_total -= prv->credit_balance;
         CSCHED_STAT_CRANK(acct_balance);
     }
 
     if ( unlikely(weight_total == 0) )
     {
-        csched_priv.credit_balance = 0;
-        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        prv->credit_balance = 0;
+        spin_unlock_irqrestore(&(prv->lock), flags);
         CSCHED_STAT_CRANK(acct_no_work);
         goto out;
     }
@@ -813,7 +900,7 @@ csched_acct(void* dummy)
     credit_xtra = 0;
     credit_cap = 0U;
 
-    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
+    list_for_each_safe( iter_sdom, next_sdom, &(prv->active_sdom) )
     {
         sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
 
@@ -833,9 +920,9 @@ csched_acct(void* dummy)
          * only when the system-wide credit balance is negative.
          */
         credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
-        if ( csched_priv.credit_balance < 0 )
-        {
-            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
+        if ( prv->credit_balance < 0 )
+        {
+            credit_peak += ( ( -prv->credit_balance * sdom->weight) +
                              (weight_total - 1)
                            ) / weight_total;
         }
@@ -877,7 +964,7 @@ csched_acct(void* dummy)
                  */
                 CSCHED_STAT_CRANK(acct_reorder);
                 list_del(&sdom->active_sdom_elem);
-                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+                list_add(&sdom->active_sdom_elem, &(prv->active_sdom));
             }
 
             credit_fair = credit_peak;
@@ -943,7 +1030,7 @@ csched_acct(void* dummy)
                 /* Upper bound on credits means VCPU stops earning */
                 if ( credit > CSCHED_CREDITS_PER_TSLICE )
                 {
-                    __csched_vcpu_acct_stop_locked(svc);
+                    __csched_vcpu_acct_stop_locked(prv, svc);
                     credit = 0;
                     atomic_set(&svc->credit, credit);
                 }
@@ -955,15 +1042,15 @@ csched_acct(void* dummy)
         }
     }
 
-    csched_priv.credit_balance = credit_balance;
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
+    prv->credit_balance = credit_balance;
+
+    spin_unlock_irqrestore(&(prv->lock), flags);
 
     /* Inform each CPU that its runq needs to be sorted */
-    csched_priv.runq_sort++;
+    prv->runq_sort++;
 
 out:
-    set_timer( &csched_priv.master_ticker, NOW() +
+    set_timer( &(prv->master_ticker), NOW() +
             MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
 }
 
@@ -972,6 +1059,7 @@ csched_tick(void *_cpu)
 {
     unsigned int cpu = (unsigned long)_cpu;
     struct csched_pcpu *spc = CSCHED_PCPU(cpu);
+    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
 
     spc->tick++;
 
@@ -979,7 +1067,7 @@ csched_tick(void *_cpu)
      * Accounting for running VCPU
      */
     if ( !is_idle_vcpu(current) )
-        csched_vcpu_acct(cpu);
+        csched_vcpu_acct(prv, cpu);
 
     /*
      * Check if runq needs to be sorted
@@ -988,7 +1076,7 @@ csched_tick(void *_cpu)
      * modified priorities. This is a special O(n) sort and runs at most
      * once per accounting period (currently 30 milliseconds).
      */
-    csched_runq_sort(cpu);
+    csched_runq_sort(prv, cpu);
 
     set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
 }
@@ -1040,10 +1128,12 @@ csched_runq_steal(int peer_cpu, int cpu,
 }
 
 static struct csched_vcpu *
-csched_load_balance(int cpu, struct csched_vcpu *snext)
+csched_load_balance(struct csched_private *prv, int cpu,
+    struct csched_vcpu *snext)
 {
     struct csched_vcpu *speer;
     cpumask_t workers;
+    cpumask_t online;
     int peer_cpu;
 
     BUG_ON( cpu != snext->vcpu->processor );
@@ -1063,7 +1153,8 @@ csched_load_balance(int cpu, struct csch
      * Peek at non-idling CPUs in the system, starting with our
      * immediate neighbour.
      */
-    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
+    online = CSCHED_CPUONLINE(per_cpu(cpupool, cpu));
+    cpus_andnot(workers, online, prv->idlers);
     cpu_clear(cpu, workers);
     peer_cpu = cpu;
 
@@ -1105,16 +1196,39 @@ csched_load_balance(int cpu, struct csch
  * fast for the common case.
  */
 static struct task_slice
-csched_schedule(s_time_t now)
+csched_schedule(struct scheduler *ops, s_time_t now)
 {
     const int cpu = smp_processor_id();
     struct list_head * const runq = RUNQ(cpu);
     struct csched_vcpu * const scurr = CSCHED_VCPU(current);
+    struct csched_private *prv = CSCHED_PRIV(ops);
     struct csched_vcpu *snext;
     struct task_slice ret;
 
     CSCHED_STAT_CRANK(schedule);
     CSCHED_VCPU_CHECK(current);
+
+    if ( unlikely(!cpu_isset(cpu, CSCHED_CPUONLINE(per_cpu(cpupool, cpu)))) )
+    {
+        struct list_head * iter;
+
+        snext = scurr;
+        if (is_idle_vcpu(current))
+            goto out;
+
+        if ( vcpu_runnable(current) )
+            __runq_insert(cpu, scurr);
+
+        list_for_each(iter, runq)
+        {
+            snext = __runq_elem(iter);
+            if ( snext->pri == CSCHED_PRI_IDLE )
+                break;
+        }
+        BUG_ON( snext->pri != CSCHED_PRI_IDLE );
+        __runq_remove(snext);
+        goto out;
+    }
 
     /*
      * Select next runnable local VCPU (ie top of local runq)
@@ -1137,20 +1251,21 @@ csched_schedule(s_time_t now)
     if ( snext->pri > CSCHED_PRI_TS_OVER )
         __runq_remove(snext);
     else
-        snext = csched_load_balance(cpu, snext);
-
+        snext = csched_load_balance(prv, cpu, snext);
+
+out:
     /*
      * Update idlers mask if necessary. When we're idling, other CPUs
      * will tickle us when they get extra work.
      */
     if ( snext->pri == CSCHED_PRI_IDLE )
     {
-        if ( !cpu_isset(cpu, csched_priv.idlers) )
-            cpu_set(cpu, csched_priv.idlers);
-    }
-    else if ( cpu_isset(cpu, csched_priv.idlers) )
-    {
-        cpu_clear(cpu, csched_priv.idlers);
+        if ( !cpu_isset(cpu, prv->idlers) )
+            cpu_set(cpu, prv->idlers);
+    }
+    else if ( cpu_isset(cpu, prv->idlers) )
+    {
+        cpu_clear(cpu, prv->idlers);
     }
 
     /*
@@ -1194,7 +1309,7 @@ csched_dump_vcpu(struct csched_vcpu *svc
 }
 
 static void
-csched_dump_pcpu(int cpu)
+csched_dump_pcpu(struct scheduler *ops, int cpu)
 {
     struct list_head *runq, *iter;
     struct csched_pcpu *spc;
@@ -1231,9 +1346,10 @@ csched_dump_pcpu(int cpu)
 }
 
 static void
-csched_dump(void)
+csched_dump(struct scheduler *ops)
 {
     struct list_head *iter_sdom, *iter_svc;
+    struct csched_private *prv = CSCHED_PRIV(ops);
     int loop;
     char idlers_buf[100];
 
@@ -1250,12 +1366,12 @@ csched_dump(void)
            "\tticks per tslice   = %d\n"
            "\tticks per acct     = %d\n"
            "\tmigration delay    = %uus\n",
-           csched_priv.ncpus,
-           csched_priv.master,
-           csched_priv.credit,
-           csched_priv.credit_balance,
-           csched_priv.weight,
-           csched_priv.runq_sort,
+           prv->ncpus,
+           prv->master,
+           prv->credit,
+           prv->credit_balance,
+           prv->weight,
+           prv->runq_sort,
            CSCHED_DEFAULT_WEIGHT,
            CSCHED_MSECS_PER_TICK,
            CSCHED_CREDITS_PER_TICK,
@@ -1263,12 +1379,12 @@ csched_dump(void)
            CSCHED_TICKS_PER_ACCT,
            vcpu_migration_delay);
 
-    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
+    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), prv->idlers);
     printk("idlers: %s\n", idlers_buf);
 
     printk("active vcpus:\n");
     loop = 0;
-    list_for_each( iter_sdom, &csched_priv.active_sdom )
+    list_for_each( iter_sdom, &(prv->active_sdom) )
     {
         struct csched_dom *sdom;
         sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
@@ -1284,18 +1400,29 @@ csched_dump(void)
     }
 }
 
-static void
-csched_init(void)
-{
-    spin_lock_init(&csched_priv.lock);
-    INIT_LIST_HEAD(&csched_priv.active_sdom);
-    csched_priv.ncpus = 0;
-    csched_priv.master = UINT_MAX;
-    cpus_clear(csched_priv.idlers);
-    csched_priv.weight = 0U;
-    csched_priv.credit = 0U;
-    csched_priv.credit_balance = 0;
-    csched_priv.runq_sort = 0U;
+static int
+csched_init(struct scheduler *ops)
+{
+    struct csched_private *prv;
+
+    prv = xmalloc(struct csched_private);
+    if ( prv == NULL )
+        return 1;
+    if (csched_priv0 == NULL)
+        csched_priv0 = prv;
+    ops->sched_data = prv;
+    spin_lock_init(&(prv->lock));
+    INIT_LIST_HEAD(&(prv->active_sdom));
+    prv->ncpus = 0;
+    prv->master = UINT_MAX;
+    cpus_clear(prv->idlers);
+    prv->weight = 0U;
+    prv->credit = 0U;
+    prv->credit_balance = 0;
+    prv->runq_sort = 0U;
+    prv->ticker_active = (csched_priv0 == prv) ? 0 : 1;
+
+    return 0;
 }
 
 /* Tickers cannot be kicked until SMP subsystem is alive. */
@@ -1305,8 +1432,10 @@ static __init int csched_start_tickers(v
     unsigned int cpu;
 
     /* Is the credit scheduler initialised? */
-    if ( csched_priv.ncpus == 0 )
+    if ( (csched_priv0 == NULL) || (csched_priv0->ncpus == 0) )
         return 0;
+
+    csched_priv0->ticker_active = 1;
 
     for_each_online_cpu ( cpu )
     {
@@ -1314,45 +1443,70 @@ static __init int csched_start_tickers(v
         set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
     }
 
-    init_timer( &csched_priv.master_ticker, csched_acct, NULL,
-                    csched_priv.master);
-
-    set_timer( &csched_priv.master_ticker, NOW() +
+    init_timer( &(csched_priv0->master_ticker), csched_acct, csched_priv0,
+                    csched_priv0->master);
+
+    set_timer( &(csched_priv0->master_ticker), NOW() +
             MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
 
     return 0;
 }
 __initcall(csched_start_tickers);
 
-static void csched_tick_suspend(void)
+static void
+csched_deinit(struct scheduler *ops)
+{
+    struct csched_private *prv;
+
+    prv = CSCHED_PRIV(ops);
+    if ( prv != NULL )
+        xfree(prv);
+}
+
+static void csched_tick_suspend(struct scheduler *ops, unsigned int cpu)
 {
     struct csched_pcpu *spc;
 
-    spc = CSCHED_PCPU(smp_processor_id());
+    spc = CSCHED_PCPU(cpu);
 
     stop_timer(&spc->ticker);
 }
 
-static void csched_tick_resume(void)
+static void csched_tick_resume(struct scheduler *ops, unsigned int cpu)
 {
     struct csched_pcpu *spc;
     uint64_t now = NOW();
-
-    spc = CSCHED_PCPU(smp_processor_id());
+    struct csched_private *prv;
+
+    prv = CSCHED_PRIV(ops);
+    if ( !prv->ticker_active )
+        return;
+
+    spc = CSCHED_PCPU(cpu);
 
     set_timer(&spc->ticker, now + MILLISECS(CSCHED_MSECS_PER_TICK)
             - now % MILLISECS(CSCHED_MSECS_PER_TICK) );
+
+    if ( (prv->ticker_active == 2) && (prv->master == cpu) )
+    {
+        set_timer( &prv->master_ticker, now +
+            MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT -
+            now % (MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT));
+        prv->ticker_active = 1;
+    }
 }
 
 struct scheduler sched_credit_def = {
     .name           = "SMP Credit Scheduler",
     .opt_name       = "credit",
     .sched_id       = XEN_SCHEDULER_CREDIT,
+    .sched_data     = &csched_priv,
 
     .init_domain    = csched_dom_init,
     .destroy_domain = csched_dom_destroy,
 
     .init_vcpu      = csched_vcpu_init,
+    .insert_vcpu    = csched_vcpu_insert,
     .destroy_vcpu   = csched_vcpu_destroy,
 
     .sleep          = csched_vcpu_sleep,
@@ -1366,6 +1520,11 @@ struct scheduler sched_credit_def = {
     .dump_cpu_state = csched_dump_pcpu,
     .dump_settings  = csched_dump,
     .init           = csched_init,
+    .deinit         = csched_deinit,
+    .alloc_vdata    = csched_alloc_vdata,
+    .free_vdata     = csched_free_vdata,
+    .alloc_pdata    = csched_alloc_pdata,
+    .free_pdata     = csched_free_pdata,
 
     .tick_suspend   = csched_tick_suspend,
     .tick_resume    = csched_tick_resume,
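
The bulk of the credit-scheduler change above makes it instantiable: the file-scope csched_priv is reached through ops->sched_data (one struct csched_private per pool), and per-cpu and per-vcpu state move behind the new alloc_pdata/free_pdata and alloc_vdata/free_vdata hooks. The following stand-alone sketch shows that "private state per scheduler instance" pattern; types and helpers are illustrative stand-ins, not the Xen definitions.

/* Stand-alone sketch of the "one scheduler instance per pool" pattern the
 * credit-scheduler changes above introduce: private state hangs off
 * ops->sched_data instead of a file-scope global.  Illustrative only. */
#include <stdio.h>
#include <stdlib.h>

struct sched_private {                   /* stand-in for csched_private */
    int ncpus;
    int weight;
};

struct sched_ops {                       /* stand-in for struct scheduler */
    const char *name;
    void       *sched_data;              /* per-instance private data */
    int  (*init)(struct sched_ops *ops);
    void (*deinit)(struct sched_ops *ops);
};

static int model_init(struct sched_ops *ops)
{
    struct sched_private *prv = calloc(1, sizeof(*prv));
    if ( prv == NULL )
        return 1;
    ops->sched_data = prv;               /* every pool gets its own copy */
    return 0;
}

static void model_deinit(struct sched_ops *ops)
{
    free(ops->sched_data);
    ops->sched_data = NULL;
}

int main(void)
{
    struct sched_ops pool0_sched = { "credit", NULL, model_init, model_deinit };
    struct sched_ops pool1_sched = { "credit", NULL, model_init, model_deinit };

    if ( pool0_sched.init(&pool0_sched) || pool1_sched.init(&pool1_sched) )
        return 1;
    printf("two independent instances: %p vs %p\n",
           pool0_sched.sched_data, pool1_sched.sched_data);
    pool0_sched.deinit(&pool0_sched);
    pool1_sched.deinit(&pool1_sched);
    return 0;
}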
diff -r 655dc3bc1d8e xen/common/sched_sedf.c
--- a/xen/common/sched_sedf.c   Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/common/sched_sedf.c   Thu Apr 09 14:54:22 2009 +0200
@@ -20,6 +20,9 @@
         if ( (_f) <= SEDFLEVEL )                \
             printk(_a );                        \
     } while ( 0 )
+
+#define SEDF_CPUONLINE(_pool)                                             \
+    (((_pool) == NULL) ? cpupool_free_cpus : (_pool)->cpu_valid)
 
 #ifndef NDEBUG
 #define SEDF_STATS
@@ -132,7 +135,7 @@ struct sedf_cpu_info {
 #define sedf_runnable(edom)  (!(EDOM_INFO(edom)->status & SEDF_ASLEEP))
 
 
-static void sedf_dump_cpu_state(int i);
+static void sedf_dump_cpu_state(struct scheduler *ops, int i);
 
 static inline int extraq_on(struct vcpu *d, int i)
 {
@@ -329,30 +332,17 @@ static inline void __add_to_runqueue_sor
 }
 
 
-static int sedf_init_vcpu(struct vcpu *v)
+static void *sedf_alloc_vdata(struct scheduler *ops, struct vcpu *v)
 {
     struct sedf_vcpu_info *inf;
 
-    if ( (v->sched_priv = xmalloc(struct sedf_vcpu_info)) == NULL )
-        return -1;
-    memset(v->sched_priv, 0, sizeof(struct sedf_vcpu_info));
-
-    inf = EDOM_INFO(v);
+    inf = xmalloc(struct sedf_vcpu_info);
+    if ( inf == NULL )
+        return NULL;
+
+    memset(inf, 0, sizeof(struct sedf_vcpu_info));
     inf->vcpu = v;
- 
-    /* Allocate per-CPU context if this is the first domain to be added. */
-    if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) )
-    {
-        per_cpu(schedule_data, v->processor).sched_priv = 
-            xmalloc(struct sedf_cpu_info);
-        BUG_ON(per_cpu(schedule_data, v->processor).sched_priv == NULL);
-        memset(CPU_INFO(v->processor), 0, sizeof(*CPU_INFO(v->processor)));
-        INIT_LIST_HEAD(WAITQ(v->processor));
-        INIT_LIST_HEAD(RUNQ(v->processor));
-        INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_PEN_Q));
-        INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_UTIL_Q));
-    }
-       
+
     /* Every VCPU gets an equal share of extratime by default. */
     inf->deadl_abs   = 0;
     inf->latency     = 0;
@@ -383,19 +373,69 @@ static int sedf_init_vcpu(struct vcpu *v
     }
     else
     {
-        EDOM_INFO(v)->deadl_abs = 0;
-        EDOM_INFO(v)->status &= ~SEDF_ASLEEP;
-    }
-
+        inf->deadl_abs = 0;
+        inf->status &= ~SEDF_ASLEEP;
+    }
+
+    return inf;
+}
+
+static void *
+sedf_alloc_pdata(struct scheduler *ops, int cpu)
+{
+    struct sedf_cpu_info *spc;
+
+    spc = xmalloc(struct sedf_cpu_info);
+    BUG_ON(spc == NULL);
+    memset(spc, 0, sizeof(*spc));
+    INIT_LIST_HEAD(&spc->waitq);
+    INIT_LIST_HEAD(&spc->runnableq);
+    INIT_LIST_HEAD(&spc->extraq[EXTRA_PEN_Q]);
+    INIT_LIST_HEAD(&spc->extraq[EXTRA_UTIL_Q]);
+
+    return (void *)spc;
+}
+
+static void
+sedf_free_pdata(struct scheduler *ops, void *spc, int cpu)
+{
+    if ( spc == NULL )
+        return;
+
+    xfree(spc);
+}
+
+static int sedf_init_vcpu(struct scheduler *ops, struct vcpu *v)
+{
+    struct sedf_vcpu_info *inf;
+
+    /* Allocate per-CPU context if this is the first domain to be added. */
+    if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) )
+    {
+        per_cpu(schedule_data, v->processor).sched_priv = 
+            sedf_alloc_pdata(ops, v->processor);
+    }
+       
+    inf = sedf_alloc_vdata(ops, v);
+    if ( inf == NULL )
+        return -1;
+
+    v->sched_priv = inf;
+ 
     return 0;
 }
 
-static void sedf_destroy_vcpu(struct vcpu *v)
-{
-    xfree(v->sched_priv);
-}
-
-static int sedf_init_domain(struct domain *d)
+static void sedf_free_vdata(struct scheduler *ops, void *priv)
+{
+    xfree(priv);
+}
+
+static void sedf_destroy_vcpu(struct scheduler *ops, struct vcpu *v)
+{
+    sedf_free_vdata(ops, v->sched_priv);
+}
+
+static int sedf_init_domain(struct scheduler *ops, struct domain *d)
 {
     d->sched_priv = xmalloc(struct sedf_dom_info);
     if ( d->sched_priv == NULL )
@@ -406,16 +446,18 @@ static int sedf_init_domain(struct domai
     return 0;
 }
 
-static void sedf_destroy_domain(struct domain *d)
+static void sedf_destroy_domain(struct scheduler *ops, struct domain *d)
 {
     xfree(d->sched_priv);
 }
 
-static int sedf_pick_cpu(struct vcpu *v)
+static int sedf_pick_cpu(struct scheduler *ops, struct vcpu *v)
 {
     cpumask_t online_affinity;
-
-    cpus_and(online_affinity, v->cpu_affinity, cpu_online_map);
+    cpumask_t online;
+
+    online = SEDF_CPUONLINE(v->domain->cpupool);
+    cpus_and(online_affinity, v->cpu_affinity, online);
     return first_cpu(online_affinity);
 }
 
@@ -751,7 +793,7 @@ static struct task_slice sedf_do_extra_s
    -timeslice for the current period used up
    -domain on waitqueue has started it's period
    -and various others ;) in general: determine which domain to run next*/
-static struct task_slice sedf_do_schedule(s_time_t now)
+static struct task_slice sedf_do_schedule(struct scheduler *ops, s_time_t now)
 {
     int                   cpu      = smp_processor_id();
     struct list_head     *runq     = RUNQ(cpu);
@@ -786,6 +828,13 @@ static struct task_slice sedf_do_schedul
     }
  check_waitq:
     update_queues(now, runq, waitq);
+
+    if ( unlikely(!cpu_isset(cpu, SEDF_CPUONLINE(per_cpu(cpupool, cpu)))) )
+    {
+        ret.task = IDLETASK(cpu);
+        ret.time = SECONDS(1);
+        goto sched_done;
+    }
  
     /*now simply pick the first domain from the runqueue, which has the
       earliest deadline, because the list is sorted*/
@@ -848,7 +897,7 @@ static struct task_slice sedf_do_schedul
 }
 
 
-static void sedf_sleep(struct vcpu *d)
+static void sedf_sleep(struct scheduler *ops, struct vcpu *d)
 {
     PRINT(2,"sedf_sleep was called, domain-id %i.%i\n",
           d->domain->domain_id, d->vcpu_id);
@@ -1067,7 +1116,7 @@ static inline int should_switch(struct v
     return 1;
 }
 
-static void sedf_wake(struct vcpu *d)
+static void sedf_wake(struct scheduler *ops, struct vcpu *d)
 {
     s_time_t              now = NOW();
     struct sedf_vcpu_info* inf = EDOM_INFO(d);
@@ -1220,8 +1269,8 @@ static void sedf_dump_domain(struct vcpu
 }
 
 
-/* dumps all domains on hte specified cpu */
-static void sedf_dump_cpu_state(int i)
+/* dumps all domains on the specified cpu */
+static void sedf_dump_cpu_state(struct scheduler *ops, int i)
 {
     struct list_head      *list, *queue, *tmp;
     struct sedf_vcpu_info *d_inf;
@@ -1294,7 +1343,7 @@ static void sedf_dump_cpu_state(int i)
 
 
 /* Adjusts periods and slices of the domains accordingly to their weights. */
-static int sedf_adjust_weights(struct xen_domctl_scheduler_op *cmd)
+static int sedf_adjust_weights(struct cpupool *c, struct xen_domctl_scheduler_op *cmd)
 {
     struct vcpu *p;
     struct domain      *d;
@@ -1315,6 +1364,8 @@ static int sedf_adjust_weights(struct xe
     rcu_read_lock(&domlist_read_lock);
     for_each_domain( d )
     {
+        if ( c != d->cpupool )
+           continue;
         for_each_vcpu( d, p )
         {
             if ( EDOM_INFO(p)->weight )
@@ -1366,7 +1417,7 @@ static int sedf_adjust_weights(struct xe
 
 
 /* set or fetch domain scheduling parameters */
-static int sedf_adjust(struct domain *p, struct xen_domctl_scheduler_op *op)
+static int sedf_adjust(struct scheduler *ops, struct domain *p, struct xen_domctl_scheduler_op *op)
 {
     struct vcpu *v;
     int rc;
@@ -1425,7 +1476,7 @@ static int sedf_adjust(struct domain *p,
             }
         }
 
-        rc = sedf_adjust_weights(op);
+        rc = sedf_adjust_weights(p->cpupool, op);
         if ( rc )
             return rc;
 
@@ -1463,6 +1514,11 @@ struct scheduler sched_sedf_def = {
 
     .init_vcpu      = sedf_init_vcpu,
     .destroy_vcpu   = sedf_destroy_vcpu,
+
+    .alloc_vdata    = sedf_alloc_vdata,
+    .free_vdata     = sedf_free_vdata,
+    .alloc_pdata    = sedf_alloc_pdata,
+    .free_pdata     = sedf_free_pdata,
 
     .do_schedule    = sedf_do_schedule,
     .pick_cpu       = sedf_pick_cpu,
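
The schedule.c changes below generalise SCHED_OP() to take an explicit ops pointer and add DOM2OP()/VCPU2OP() so each call dispatches to the scheduler of the domain's pool; sched_move_domain() then reinitialises per-vcpu scheduler data when a domain changes pools. A stand-alone sketch of the dispatch pattern; the types and the pool-less fallback here are illustrative, not the Xen code.

/* Sketch of the per-pool dispatch the schedule.c hunk below introduces:
 * SCHED_OP() gains an ops pointer and DOM2OP() picks the pool's scheduler.
 * Types and helpers here are illustrative, not the Xen definitions. */
#include <stdio.h>

struct ops_model {
    const char *name;
    int (*adjust)(struct ops_model *ops, int domid);
};

struct pool_model   { struct ops_model sched; };
struct domain_model { int domid; struct pool_model *cpupool; };

static int model_adjust(struct ops_model *ops, int domid)
{
    printf("adjust dom%d via scheduler '%s'\n", domid, ops->name);
    return 0;
}

/* Pool-less domains fall back to a default ops instance, as in the patch. */
static struct ops_model default_ops = { "default credit", model_adjust };

#define DOM2OP_MODEL(d) \
    (((d)->cpupool == NULL) ? &default_ops : &((d)->cpupool->sched))
#define SCHED_OP_MODEL(opsptr, fn, ...) \
    (((opsptr)->fn != NULL) ? (opsptr)->fn(opsptr, __VA_ARGS__) : 0)

int main(void)
{
    struct pool_model   pool1 = { { "sedf", model_adjust } };
    struct domain_model dom0  = { 0, NULL };     /* no pool: default ops   */
    struct domain_model dom1  = { 1, &pool1 };   /* dispatch into its pool */

    (void)SCHED_OP_MODEL(DOM2OP_MODEL(&dom0), adjust, dom0.domid);
    (void)SCHED_OP_MODEL(DOM2OP_MODEL(&dom1), adjust, dom1.domid);
    return 0;
}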
diff -r 655dc3bc1d8e xen/common/schedule.c
--- a/xen/common/schedule.c     Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/common/schedule.c     Thu Apr 16 09:18:40 2009 +0200
@@ -55,6 +55,7 @@ static void poll_timer_fn(void *data);
 
 /* This is global for now so that private implementations can reach it */
 DEFINE_PER_CPU(struct schedule_data, schedule_data);
+DEFINE_PER_CPU(struct scheduler *, scheduler);
 
 extern struct scheduler sched_sedf_def;
 extern struct scheduler sched_credit_def;
@@ -66,9 +67,15 @@ static struct scheduler *schedulers[] = 
 
 static struct scheduler ops;
 
-#define SCHED_OP(fn, ...)                                 \
-         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
-          : (typeof(ops.fn(__VA_ARGS__)))0 )
+#define SCHED_OP(opsptr, fn, ...)                                          \
+         (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ )  \
+          : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
+
+#define DOM2OP(_d)    (((_d)->cpupool == NULL) ? &ops : &((_d)->cpupool->sched))
+#define VCPU2OP(_v)   (DOM2OP((_v)->domain))
+#define VCPU2ONLINE(_v)                                                    \
+         (((_v)->domain->cpupool == NULL) ? cpu_online_map                 \
+         : (_v)->domain->cpupool->cpu_valid)
 
 static inline void trace_runstate_change(struct vcpu *v, int new_state)
 {
@@ -182,7 +189,13 @@ int sched_init_vcpu(struct vcpu *v, unsi
 
     TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);
 
-    return SCHED_OP(init_vcpu, v);
+    if ( SCHED_OP(DOM2OP(d), init_vcpu, v) != 0 )
+        return 1;
+
+    if ( is_idle_domain(d) )
+        per_cpu(schedule_data, v->processor).sched_idlevpriv = v->sched_priv;
+
+    return 0;
 }
 
 void sched_destroy_vcpu(struct vcpu *v)
@@ -190,17 +203,47 @@ void sched_destroy_vcpu(struct vcpu *v)
     kill_timer(&v->periodic_timer);
     kill_timer(&v->singleshot_timer);
     kill_timer(&v->poll_timer);
-    SCHED_OP(destroy_vcpu, v);
+    SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
+}
+
+void sched_move_domain(struct domain *d, struct cpupool *c)
+{
+    struct vcpu *v;
+    unsigned int new_p;
+
+    domain_pause(d);
+
+    new_p = first_cpu(c->cpu_valid);
+    for_each_vcpu ( d, v )
+    {
+        migrate_timer(&v->periodic_timer, new_p);
+        migrate_timer(&v->singleshot_timer, new_p);
+        migrate_timer(&v->poll_timer, new_p);
+
+        SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
+
+        cpus_setall(v->cpu_affinity);
+        v->processor = new_p;
+        SCHED_OP(&(c->sched), init_vcpu, v);
+
+        new_p = next_cpu(new_p, c->cpu_valid);
+        if ( new_p == NR_CPUS )
+            new_p = first_cpu(c->cpu_valid);
+    }
+
+    d->cpupool = c;
+
+    domain_unpause(d);
 }
 
 int sched_init_domain(struct domain *d)
 {
-    return SCHED_OP(init_domain, d);
+    return SCHED_OP(DOM2OP(d), init_domain, d);
 }
 
 void sched_destroy_domain(struct domain *d)
 {
-    SCHED_OP(destroy_domain, d);
+    SCHED_OP(DOM2OP(d), destroy_domain, d);
 }
 
 void vcpu_sleep_nosync(struct vcpu *v)
@@ -214,7 +257,7 @@ void vcpu_sleep_nosync(struct vcpu *v)
         if ( v->runstate.state == RUNSTATE_runnable )
             vcpu_runstate_change(v, RUNSTATE_offline, NOW());
 
-        SCHED_OP(sleep, v);
+        SCHED_OP(VCPU2OP(v), sleep, v);
     }
 
     vcpu_schedule_unlock_irqrestore(v, flags);
@@ -242,7 +285,7 @@ void vcpu_wake(struct vcpu *v)
     {
         if ( v->runstate.state >= RUNSTATE_blocked )
             vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
-        SCHED_OP(wake, v);
+        SCHED_OP(VCPU2OP(v), wake, v);
     }
     else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
     {
@@ -297,7 +340,7 @@ static void vcpu_migrate(struct vcpu *v)
 
     /* Switch to new CPU, then unlock old CPU. */
     old_cpu = v->processor;
-    v->processor = SCHED_OP(pick_cpu, v);
+    v->processor = SCHED_OP(VCPU2OP(v), pick_cpu, v);
     spin_unlock_irqrestore(
         &per_cpu(schedule_data, old_cpu).schedule_lock, flags);
 
@@ -326,22 +369,32 @@ void vcpu_force_reschedule(struct vcpu *
 }
 
 /*
- * This function is used by cpu_hotplug code from stop_machine context.
- * Hence we can avoid needing to take the 
+ * This function is used by cpu_hotplug code from stop_machine context
+ * and from cpupools to switch schedulers on a cpu.
  */
-void cpu_disable_scheduler(void)
+int cpu_disable_scheduler(unsigned int cpu, int lock)
 {
     struct domain *d;
     struct vcpu *v;
-    unsigned int cpu = smp_processor_id();
+    struct cpupool *c;
+    int    ret = 0;
+
+    c = per_cpu(cpupool, cpu);
+    if ( c == NULL )
+        return ret;
 
     for_each_domain ( d )
     {
+        if ( (d->cpupool != c) || c->pool_paused )
+            continue;
+
         for_each_vcpu ( d, v )
         {
             if ( is_idle_vcpu(v) )
                 continue;
 
+            if ( lock != 0 )
+                vcpu_schedule_lock_irq(v);
             if ( (cpus_weight(v->cpu_affinity) == 1) &&
                  cpu_isset(cpu, v->cpu_affinity) )
             {
@@ -351,29 +404,49 @@ void cpu_disable_scheduler(void)
             }
 
             /*
-             * Migrate single-shot timers to CPU0. A new cpu will automatically
-             * be chosen when the timer is next re-set.
+             * Migrate single-shot timers to other cpu of same pool. A new cpu
+             * will automatically be chosen when the timer is next re-set.
              */
             if ( v->singleshot_timer.cpu == cpu )
-                migrate_timer(&v->singleshot_timer, 0);
+            {
+                int cpu_mig;
+
+                cpu_mig = first_cpu(c->cpu_valid);
+                if ( cpu_mig == cpu )
+                    cpu_mig = next_cpu(cpu_mig, c->cpu_valid);
+                migrate_timer(&v->singleshot_timer, cpu_mig);
+            }
 
             if ( v->processor == cpu )
             {
                 set_bit(_VPF_migrating, &v->pause_flags);
+                if ( lock != 0 )
+                    vcpu_schedule_unlock_irq(v);
                 vcpu_sleep_nosync(v);
                 vcpu_migrate(v);
             }
+            else if ( lock != 0 )
+                vcpu_schedule_unlock_irq(v);
+            /*
+             * A vcpu active in the hypervisor will not be migratable.
+             * The caller should try again after releasing and reacquiring
+             * all locks.
+             */
+            if ( v->processor == cpu )
+                ret = -EAGAIN;
         }
     }
+    return ret;
 }
 
 static int __vcpu_set_affinity(
     struct vcpu *v, cpumask_t *affinity,
     bool_t old_lock_status, bool_t new_lock_status)
 {
-    cpumask_t online_affinity, old_affinity;
-
-    cpus_and(online_affinity, *affinity, cpu_online_map);
+    cpumask_t online, online_affinity, old_affinity;
+
+    online = VCPU2ONLINE(v);
+    cpus_and(online_affinity, *affinity, online);
     if ( cpus_empty(online_affinity) )
         return -EINVAL;
 
@@ -424,12 +497,13 @@ int vcpu_locked_change_affinity(struct v
 
 void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity)
 {
-    cpumask_t online_affinity;
+    cpumask_t online, online_affinity;
 
     /* Do not fail if no CPU in old affinity mask is online. */
-    cpus_and(online_affinity, *affinity, cpu_online_map);
+    online = VCPU2ONLINE(v);
+    cpus_and(online_affinity, *affinity, online);
     if ( cpus_empty(online_affinity) )
-        *affinity = cpu_online_map;
+        *affinity = VCPU2ONLINE(v);
 
     if ( __vcpu_set_affinity(v, affinity, 1, 0) != 0 )
         BUG();
@@ -721,7 +795,7 @@ long sched_adjust(struct domain *d, stru
     struct vcpu *v;
     long ret;
     
-    if ( (op->sched_id != ops.sched_id) ||
+    if ( (op->sched_id != DOM2OP(d)->sched_id) ||
          ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
           (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
         return -EINVAL;
@@ -748,7 +822,7 @@ long sched_adjust(struct domain *d, stru
     if ( d == current->domain )
         vcpu_schedule_lock_irq(current);
 
-    if ( (ret = SCHED_OP(adjust, d, op)) == 0 )
+    if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 )
         TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
 
     if ( d == current->domain )
@@ -796,6 +870,7 @@ static void schedule(void)
 {
     struct vcpu          *prev = current, *next = NULL;
     s_time_t              now = NOW();
+    struct scheduler     *sched = this_cpu(scheduler);
     struct schedule_data *sd;
     struct task_slice     next_slice;
 
@@ -811,7 +886,7 @@ static void schedule(void)
     stop_timer(&sd->s_timer);
     
     /* get policy-specific decision on scheduling... */
-    next_slice = ops.do_schedule(now);
+    next_slice = sched->do_schedule(sched, now);
 
     next = next_slice.task;
 
@@ -911,18 +986,25 @@ static void poll_timer_fn(void *data)
         vcpu_unblock(v);
 }
 
+/* Get scheduler by id */
+struct scheduler *scheduler_get_by_id(unsigned int id)
+{
+    int i;
+
+    for ( i = 0; schedulers[i] != NULL; i++ )
+    {
+        if ( schedulers[i]->sched_id == id )
+            return schedulers[i];
+    }
+    return NULL;
+}
+
 /* Initialise the data structures. */
 void __init scheduler_init(void)
 {
     int i;
 
     open_softirq(SCHEDULE_SOFTIRQ, schedule);
-
-    for_each_cpu ( i )
-    {
-        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
-        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
-    }
 
     for ( i = 0; schedulers[i] != NULL; i++ )
     {
@@ -934,43 +1016,121 @@ void __init scheduler_init(void)
     if ( schedulers[i] == NULL )
         printk("Could not find scheduler: %s\n", opt_sched);
 
+    for_each_cpu ( i )
+    {
+        per_cpu(scheduler, i) = &ops;
+        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
+        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
+    }
+
     printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
-    SCHED_OP(init);
-}
-
-void dump_runq(unsigned char key)
-{
-    s_time_t      now = NOW();
-    int           i;
+    if ( SCHED_OP(&ops, init) )
+        panic("scheduler returned error on init\n");
+}
+
+/* switch scheduler on cpu */
+void schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
+{
     unsigned long flags;
-
-    local_irq_save(flags);
-
-    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
-    SCHED_OP(dump_settings);
-    printk("sched_smt_power_savings: %s\n",
-            sched_smt_power_savings? "enabled":"disabled");
-    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);
-
-    for_each_online_cpu ( i )
+    struct vcpu *v;
+    void *vpriv = NULL;
+    void *ppriv;
+    void *ppriv_old;
+    struct scheduler *old_ops;
+    struct scheduler *new_ops;
+
+    old_ops = per_cpu(scheduler, cpu);
+    new_ops = (c == NULL) ? &ops : &(c->sched);
+    v = per_cpu(schedule_data, cpu).idle;
+    ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
+    if ( c != NULL )
+        vpriv = SCHED_OP(new_ops, alloc_vdata, v);
+
+    spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+
+    if ( c == NULL )
+    {
+        vpriv = v->sched_priv;
+        v->sched_priv = per_cpu(schedule_data, cpu).sched_idlevpriv;
+    }
+    else
+    {
+        v->sched_priv = vpriv;
+        vpriv = NULL;
+    }
+    SCHED_OP(old_ops, tick_suspend, cpu);
+    per_cpu(scheduler, cpu) = new_ops;
+    ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
+    per_cpu(schedule_data, cpu).sched_priv = ppriv;
+    SCHED_OP(new_ops, tick_resume, cpu);
+    SCHED_OP(new_ops, insert_vcpu, v);
+
+    spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+
+    if ( vpriv != NULL )
+        SCHED_OP(old_ops, free_vdata, vpriv);
+    SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
+}
+
+/* init scheduler global data */
+int schedule_init_global(char *name, struct scheduler *sched)
+{
+    int i;
+    struct scheduler *data;
+
+    data = &ops;
+    for ( i = 0; (schedulers[i] != NULL) && (name != NULL) ; i++ )
+    {
+        if ( strcmp(schedulers[i]->opt_name, name) == 0 )
+        {
+            data = schedulers[i];
+            break;
+        }
+    }
+    memcpy(sched, data, sizeof(*sched));
+    return SCHED_OP(sched, init);
+}
+
+/* deinitialize scheduler global data */
+void schedule_deinit_global(struct scheduler *sched)
+{
+    SCHED_OP(sched, deinit);
+}
+
+void schedule_dump(struct cpupool *c)
+{
+    int               i;
+    struct scheduler *sched;
+    cpumask_t         cpus;
+
+    sched = (c == NULL) ? &ops : &(c->sched);
+    cpus = (c == NULL) ? cpupool_free_cpus : c->cpu_valid;
+    printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
+    SCHED_OP(sched, dump_settings);
+
+    for_each_cpu_mask (i, cpus)
     {
         spin_lock(&per_cpu(schedule_data, i).schedule_lock);
         printk("CPU[%02d] ", i);
-        SCHED_OP(dump_cpu_state, i);
+        SCHED_OP(sched, dump_cpu_state, i);
         spin_unlock(&per_cpu(schedule_data, i).schedule_lock);
     }
-
-    local_irq_restore(flags);
-}
-
-void sched_tick_suspend(void)
-{
-    SCHED_OP(tick_suspend);
-}
-
-void sched_tick_resume(void)
-{
-    SCHED_OP(tick_resume);
+}
+
+void sched_tick_suspend(unsigned int cpu)
+{
+    struct scheduler *sched;
+
+    sched = per_cpu(scheduler, cpu);
+    SCHED_OP(sched, tick_suspend, cpu);
+}
+
+void sched_tick_resume(unsigned int cpu)
+{
+    struct scheduler *sched;
+
+    sched = per_cpu(scheduler, cpu);
+    SCHED_OP(sched, tick_resume, cpu);
 }
 
 #ifdef CONFIG_COMPAT
diff -r 655dc3bc1d8e xen/include/public/domctl.h
--- a/xen/include/public/domctl.h       Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/include/public/domctl.h       Thu Apr 09 11:47:18 2009 +0200
@@ -59,7 +59,11 @@ struct xen_domctl_createdomain {
  /* Should domain memory integrity be verified by tboot during Sx? */
 #define _XEN_DOMCTL_CDF_s3_integrity  2
 #define XEN_DOMCTL_CDF_s3_integrity   (1U<<_XEN_DOMCTL_CDF_s3_integrity)
+ /* cpupool is specified (0 otherwise) */
+#define _XEN_DOMCTL_CDF_pool          3
+#define XEN_DOMCTL_CDF_pool           (1U<<_XEN_DOMCTL_CDF_pool)
     uint32_t flags;
+    uint32_t cpupool;
 };
 typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t);
@@ -109,6 +113,7 @@ struct xen_domctl_getdomaininfo {
     uint32_t max_vcpu_id;        /* Maximum VCPUID in use by this domain. */
     uint32_t ssidref;
     xen_domain_handle_t handle;
+    uint32_t cpupool;
 };
 typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t);
@@ -645,6 +650,30 @@ typedef struct xen_domctl_hvmcontext_par
     XEN_GUEST_HANDLE_64(uint8) buffer;  /* OUT: buffer to write record into */
 } xen_domctl_hvmcontext_partial_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t);
+
+/*
+ * cpupool operations: create/destroy pools, add/remove cpus, move domains.
+ */
+#define XEN_DOMCTL_cpupool_op  56
+#define XEN_DOMCTL_CPUPOOL_OP_CREATE                1  /* C */
+#define XEN_DOMCTL_CPUPOOL_OP_DESTROY               2  /* D */
+#define XEN_DOMCTL_CPUPOOL_OP_INFO                  3  /* I */
+#define XEN_DOMCTL_CPUPOOL_OP_ADDCPU                4  /* A */
+#define XEN_DOMCTL_CPUPOOL_OP_RMCPU                 5  /* R */
+#define XEN_DOMCTL_CPUPOOL_OP_MOVEDOMAIN            6  /* M */
+#define XEN_DOMCTL_CPUPOOL_OP_FREEINFO              7  /* F */
+#define XEN_DOMCTL_CPUPOOL_PAR_ANY     0xFFFFFFFF
+struct xen_domctl_cpupool_op {
+    uint32_t op;          /* IN */
+    uint32_t cpupool_id;  /* IN: CDIARM OUT: CI */
+    uint32_t sched_id;    /* IN: C      OUT: I  */
+    uint32_t domid;       /* IN: M              */
+    uint32_t cpu;         /* IN: AR             */
+    uint32_t n_dom;       /*            OUT: I  */
+    struct xenctl_cpumap cpumap; /*     OUT: IF */
+};
+typedef struct xen_domctl_cpupool_op xen_domctl_cpupool_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_cpupool_op_t);
 
 
 struct xen_domctl {
@@ -688,6 +717,7 @@ struct xen_domctl {
         struct xen_domctl_set_target        set_target;
         struct xen_domctl_subscribe         subscribe;
         struct xen_domctl_debug_op          debug_op;
+        struct xen_domctl_cpupool_op        cpupool_op;
 #if defined(__i386__) || defined(__x86_64__)
         struct xen_domctl_cpuid             cpuid;
 #endif
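
To illustrate the interface defined above: a toolstack could drive the new
XEN_DOMCTL_cpupool_op roughly as sketched below.  This is not part of the
patch; do_domctl() and xc_handle stand in for whatever mechanism the caller
uses to issue a domctl (interface_version handling and error checking are
omitted), and sched_id is assumed to hold a valid scheduler id.

/* Hypothetical caller-side sketch, not taken from this patch: create a
 * cpupool with a chosen scheduler and hand it one free physical cpu. */
struct xen_domctl domctl;

memset(&domctl, 0, sizeof(domctl));
domctl.cmd = XEN_DOMCTL_cpupool_op;
domctl.u.cpupool_op.op = XEN_DOMCTL_CPUPOOL_OP_CREATE;
domctl.u.cpupool_op.cpupool_id = XEN_DOMCTL_CPUPOOL_PAR_ANY; /* any free id */
domctl.u.cpupool_op.sched_id = sched_id;
if ( do_domctl(xc_handle, &domctl) == 0 )
{
    /* the hypervisor has filled in the id of the new pool */
    domctl.u.cpupool_op.op  = XEN_DOMCTL_CPUPOOL_OP_ADDCPU;
    domctl.u.cpupool_op.cpu = XEN_DOMCTL_CPUPOOL_PAR_ANY;    /* any free cpu */
    do_domctl(xc_handle, &domctl);
}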
diff -r 655dc3bc1d8e xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h        Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/include/xen/sched-if.h        Thu Apr 16 09:16:18 2009 +0200
@@ -10,15 +10,24 @@
 
 #include <xen/percpu.h>
 
+/* A global pointer to the initial cpupool (POOL0). */
+extern struct cpupool *cpupool0;
+
+/* cpus currently in no cpupool */
+extern cpumask_t cpupool_free_cpus;
+
 struct schedule_data {
     spinlock_t          schedule_lock;  /* spinlock protecting curr        */
     struct vcpu        *curr;           /* current task                    */
     struct vcpu        *idle;           /* idle task for this cpu          */
     void               *sched_priv;
+    void               *sched_idlevpriv; /* default scheduler vcpu data     */
     struct timer        s_timer;        /* scheduling timer                */
 } __cacheline_aligned;
 
 DECLARE_PER_CPU(struct schedule_data, schedule_data);
+DECLARE_PER_CPU(struct scheduler *, scheduler);
+DECLARE_PER_CPU(struct cpupool *, cpupool);
 
 static inline void vcpu_schedule_lock(struct vcpu *v)
 {
@@ -58,28 +67,50 @@ struct scheduler {
     char *name;             /* full name for this scheduler      */
     char *opt_name;         /* option name for this scheduler    */
     unsigned int sched_id;  /* ID for this scheduler             */
+    void *sched_data;       /* global data pointer               */
 
-    void         (*init)           (void);
+    int          (*init)           (struct scheduler *);
+    void         (*deinit)         (struct scheduler *);
 
-    int          (*init_domain)    (struct domain *);
-    void         (*destroy_domain) (struct domain *);
+    void         (*free_vdata)     (struct scheduler *, void *);
+    void *       (*alloc_vdata)    (struct scheduler *, struct vcpu *);
+    void         (*free_pdata)     (struct scheduler *, void *, int);
+    void *       (*alloc_pdata)    (struct scheduler *, int);
 
-    int          (*init_vcpu)      (struct vcpu *);
-    void         (*destroy_vcpu)   (struct vcpu *);
+    int          (*init_domain)    (struct scheduler *, struct domain *);
+    void         (*destroy_domain) (struct scheduler *, struct domain *);
 
-    void         (*sleep)          (struct vcpu *);
-    void         (*wake)           (struct vcpu *);
+    int          (*init_vcpu)      (struct scheduler *, struct vcpu *);
+    void         (*insert_vcpu)    (struct scheduler *, struct vcpu *);
+    void         (*destroy_vcpu)   (struct scheduler *, struct vcpu *);
 
-    struct task_slice (*do_schedule) (s_time_t);
+    void         (*sleep)          (struct scheduler *, struct vcpu *);
+    void         (*wake)           (struct scheduler *, struct vcpu *);
 
-    int          (*pick_cpu)       (struct vcpu *);
-    int          (*adjust)         (struct domain *,
+    struct task_slice (*do_schedule) (struct scheduler *, s_time_t);
+
+    int          (*pick_cpu)       (struct scheduler *, struct vcpu *);
+    int          (*adjust)         (struct scheduler *, struct domain *,
                                     struct xen_domctl_scheduler_op *);
-    void         (*dump_settings)  (void);
-    void         (*dump_cpu_state) (int);
+    void         (*dump_settings)  (struct scheduler *);
+    void         (*dump_cpu_state) (struct scheduler *, int);
 
-    void         (*tick_suspend)    (void);
-    void         (*tick_resume)     (void);
+    void         (*tick_suspend)   (struct scheduler *, unsigned int);
+    void         (*tick_resume)    (struct scheduler *, unsigned int);
 };
 
+struct cpupool
+{
+    int              cpupool_id;
+    cpumask_t        cpu_valid;      /* all cpus assigned to pool */
+    cpumask_t        cpus_borrowed;  /* cpus borrowed or lent */
+    struct cpupool   *next;
+    unsigned int     n_dom;
+    int              cpu_in_transit; /* used for adding/removing cpus */
+    bool_t           pool_paused;
+    struct scheduler sched;
+};
+
+struct scheduler *scheduler_get_by_id(unsigned int id);
+
 #endif /* __XEN_SCHED_IF_H__ */
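
For readers jumping in at this excerpt: the schedule.c hunks above rely on a
few helpers (DOM2OP, VCPU2OP, VCPU2ONLINE and the SCHED_OP macro now taking a
scheduler pointer) which are defined in the earlier schedule.c part of the
patch.  A sketch that is consistent with the call sites shown here, though
not necessarily the patch's literal definitions:

/* Illustrative only - reconstructed from the call sites, not quoted from
 * the patch.  Domains without a pool (i.e. the idle domain) fall back to
 * the built-in default scheduler instance 'ops'. */
#define DOM2OP(d)      (((d)->cpupool == NULL) ? &ops : &((d)->cpupool->sched))
#define VCPU2OP(v)     (DOM2OP((v)->domain))
#define VCPU2ONLINE(v) (((v)->domain->cpupool == NULL) ?                    \
                        cpu_online_map : (v)->domain->cpupool->cpu_valid)

/* SCHED_OP selects a scheduler instance explicitly and passes it on to the
 * callback as its first argument. */
#define SCHED_OP(opsptr, fn, ...)                                           \
    (((opsptr)->fn != NULL) ? (opsptr)->fn(opsptr, ##__VA_ARGS__)           \
                            : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0)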
diff -r 655dc3bc1d8e xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/include/xen/sched.h   Thu Apr 16 09:14:00 2009 +0200
@@ -182,6 +182,7 @@ struct domain
 
     /* Scheduling. */
     void            *sched_priv;    /* scheduler-specific data */
+    struct cpupool  *cpupool;
 
     struct domain   *next_in_list;
     struct domain   *next_in_hashbucket;
@@ -341,7 +342,7 @@ static inline struct domain *get_current
 }
 
 struct domain *domain_create(
-    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
+    domid_t domid, int poolid, unsigned int domcr_flags, ssidref_t ssidref);
  /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
 #define _DOMCRF_hvm           0
 #define DOMCRF_hvm            (1U<<_DOMCRF_hvm)
@@ -426,10 +427,11 @@ void sched_destroy_vcpu(struct vcpu *v);
 void sched_destroy_vcpu(struct vcpu *v);
 int  sched_init_domain(struct domain *d);
 void sched_destroy_domain(struct domain *d);
+void sched_move_domain(struct domain *d, struct cpupool *c);
 long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *);
 int  sched_id(void);
-void sched_tick_suspend(void);
-void sched_tick_resume(void);
+void sched_tick_suspend(unsigned int cpu);
+void sched_tick_resume(unsigned int cpu);
 void vcpu_wake(struct vcpu *d);
 void vcpu_sleep_nosync(struct vcpu *d);
 void vcpu_sleep_sync(struct vcpu *d);
@@ -533,8 +535,13 @@ void domain_unpause_by_systemcontroller(
 void domain_unpause_by_systemcontroller(struct domain *d);
 void cpu_init(void);
 
+struct scheduler;
+
+int schedule_init_global(char *name, struct scheduler *sched);
+void schedule_deinit_global(struct scheduler *sched);
+void schedule_cpu_switch(unsigned int cpu, struct cpupool *c);
 void vcpu_force_reschedule(struct vcpu *v);
-void cpu_disable_scheduler(void);
+int cpu_disable_scheduler(unsigned int cpu, int lock);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
 int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
 int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity);
@@ -560,6 +567,21 @@ extern enum cpufreq_controller {
 extern enum cpufreq_controller {
     FREQCTL_none, FREQCTL_dom0_kernel, FREQCTL_xen
 } cpufreq_controller;
+
+#define CPUPOOLID_NONE    -1
+
+struct cpupool *cpupool_create(int poolid, char *sched);
+int cpupool_destroy(struct cpupool *c);
+int cpupool0_cpu_assign(struct cpupool *c);
+int cpupool_assign_ncpu(struct cpupool *c, int ncpu);
+void cpupool_cpu_add(unsigned int cpu);
+int cpupool_cpu_remove(unsigned int cpu);
+int cpupool_borrow_cpu(struct cpupool *c, unsigned int cpu);
+int cpupool_return_cpu(struct cpupool *c);
+int cpupool_add_domain(struct domain *d, int poolid);
+void cpupool_rm_domain(struct domain *d);
+int cpupool_do_domctl(struct xen_domctl *op);
+#define num_cpupool_cpus(c) (cpus_weight((c)->cpu_valid))
 
 #endif /* __SCHED_H__ */
 
diff -r 655dc3bc1d8e xen/common/cpupool.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/common/cpupool.c      Fri Apr 17 11:01:51 2009 +0200
@@ -0,0 +1,698 @@
+/******************************************************************************
+ * cpupool.c
+ * 
+ * Generic cpupool-handling functions.
+ *
+ * (C) 2009, Juergen Gross, Fujitsu Technology Solutions
+ */
+
+#include <xen/lib.h>
+#include <xen/init.h>
+#include <xen/cpumask.h>
+#include <xen/percpu.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+
+#define PRINTD(args...)    ((void)0)  /* define as printk(args) for debugging */
+
+#define for_each_cpupool(ptr)    \
+    for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next))
+
+struct cpupool *cpupool0;
+cpumask_t cpupool_free_cpus;
+cpumask_t cpupool_free_cpus_borrowed;
+
+static struct cpupool *cpupool_list;     /* linked list, sorted by poolid */
+
+static int cpupool0_max_cpus;
+integer_param("pool0_max_cpus", cpupool0_max_cpus);
+
+static DEFINE_SPINLOCK(cpupool_lock);
+
+DEFINE_PER_CPU(struct cpupool *, cpupool);
+
+static struct cpupool *alloc_cpupool_struct(void)
+{
+    return xmalloc(struct cpupool);
+}
+
+static void free_cpupool_struct(struct cpupool *c)
+{
+    xfree(c);
+}
+
+/*
+ * find a cpupool by its id. to be called with cpupool lock held.
+ * returns NULL if not found (or the next higher pool if exact is not set).
+ */
+static struct cpupool *cpupool_find_by_id(int id, int exact)
+{
+    struct cpupool **q;
+
+    for_each_cpupool(q)
+    {
+        if ( (*q)->cpupool_id == id )
+            return *q;
+        if ( (*q)->cpupool_id > id )
+            break;
+    }
+    return exact ? NULL : *q;
+}
+
+/*
+ * create a new cpupool with specified poolid
+ * returns pointer to new cpupool structure if okay, NULL otherwise
+ * possible failures:
+ * - no memory
+ * - poolid already used
+ * - unknown scheduler
+ */
+struct cpupool *cpupool_create(int poolid, char *sched)
+{
+    struct cpupool *c;
+    struct cpupool **q;
+    int last = 0;
+
+    if ( (c = alloc_cpupool_struct()) == NULL )
+        return NULL;
+    memset(c, 0, sizeof(*c));
+
+    PRINTD("cpupool_create(%d,%s)\n", poolid, sched);
+    spin_lock(&cpupool_lock);
+    for_each_cpupool(q)
+    {
+        last = (*q)->cpupool_id;
+        if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) )
+            break;
+    }
+    if ( *q != NULL )
+    {
+        if ( (*q)->cpupool_id == poolid )
+        {
+            spin_unlock(&cpupool_lock);
+            free_cpupool_struct(c);
+            return NULL;
+        }
+        c->next = *q;
+    }
+    *q = c;
+    c->cpupool_id = (poolid == CPUPOOLID_NONE) ? (last + 1) : poolid;
+    c->cpu_in_transit = -1;
+    if ( schedule_init_global(sched, &(c->sched)) )
+    {
+        spin_unlock(&cpupool_lock);
+        cpupool_destroy(c);
+        return NULL;
+    }
+    spin_unlock(&cpupool_lock);
+
+    printk("Created cpupool %d with scheduler %s (%s)\n", c->cpupool_id,
+        c->sched.name, c->sched.opt_name);
+
+    return c;
+}
+
+/*
+ * destroys the given cpupool
+ * returns 0 on success, 1 otherwise
+ * possible failures:
+ * - pool still in use
+ * - cpus still assigned to pool
+ * - pool not in list
+ */
+int cpupool_destroy(struct cpupool *c)
+{
+    struct cpupool **q;
+
+    spin_lock(&cpupool_lock);
+    for_each_cpupool(q)
+        if ( *q == c )
+            break;
+    if ( (*q != c) || (c->n_dom != 0) || cpus_weight(c->cpu_valid) )
+    {
+        spin_unlock(&cpupool_lock);
+        return 1;
+    }
+    *q = c->next;
+    spin_unlock(&cpupool_lock);
+    PRINTD("cpupool_destroy(%d)\n", c->cpupool_id);
+    schedule_deinit_global(&(c->sched));
+    free_cpupool_struct(c);
+    return 0;
+}
+
+/*
+ * assign a specific cpu to a cpupool
+ */
+static void cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
+{
+    PRINTD("cpupool_assign_cpu(%d,%d)\n", c->cpupool_id, cpu);
+    per_cpu(cpupool, cpu) = c;
+    schedule_cpu_switch(cpu, c);
+    cpu_clear(cpu, cpupool_free_cpus);
+    cpu_set(cpu, c->cpu_valid);
+    PRINTD("cpupool_assign_cpu(%d,%d) ready\n", c->cpupool_id, cpu);
+}
+
+/*
+ * assign free physical cpus to a cpupool
+ * cpus assigned are unused cpus with lowest possible ids
+ * returns the number of cpus assigned
+ */
+int cpupool_assign_ncpu(struct cpupool *c, int ncpu)
+{
+    int i;
+    int n;
+
+    n = 0;
+    spin_lock(&cpupool_lock);
+    for_each_cpu_mask(i, cpupool_free_cpus)
+    {
+        cpupool_assign_cpu_locked(c, i);
+        n++;
+        if ( n == ncpu )
+            break;
+    }
+    spin_unlock(&cpupool_lock);
+    PRINTD("cpupool_assign_ncpu(%d,%d) rc %d\n", c->cpupool_id, ncpu, n);
+    return n;
+}
+
+static void cpupool_unassign_cpu_locked_1(struct cpupool *c, unsigned int cpu)
+{
+    PRINTD("cpupool_unassign_cpu(%d,%d)\n", c->cpupool_id, cpu);
+    c->cpu_in_transit = cpu;
+}
+
+static int cpupool_unassign_cpu_locked_2(struct cpupool *c)
+{
+    uint64_t to = NOW() + MILLISECS(100);
+    int cpu = c->cpu_in_transit;
+    int ret;
+
+    cpu_clear(cpu, c->cpu_valid);
+    while ( ((ret = cpu_disable_scheduler(cpu, 1)) != 0) && (NOW() < to) );
+    if ( ret )
+    {
+        cpu_set(cpu, c->cpu_valid);
+        c->cpu_in_transit = -1;
+    }
+    else
+    {
+        c->cpu_in_transit = -1;
+        cpu_set(cpu, cpupool_free_cpus);
+        schedule_cpu_switch(cpu, NULL);
+        per_cpu(cpupool, cpu) = NULL;
+    }
+    PRINTD("cpupool_unassign_cpu(%d,%d) ret %d\n", c->cpupool_id, cpu, ret);
+    return ret;
+}
+
+static long cpupool_unassign_cpu_helper(void *info)
+{
+    struct cpupool *c = (struct cpupool *)info;
+    long ret;
+
+    ret = cpupool_unassign_cpu_locked_2(c);
+    spin_unlock(&cpupool_lock);
+    return ret;
+}
+
+static int cpupool_unassign_cpu_locked(struct cpupool *c, unsigned int cpu)
+{
+    cpupool_unassign_cpu_locked_1(c, cpu);
+    return cpupool_unassign_cpu_locked_2(c);
+}
+
+/*
+ * unassign a specific cpu from a cpupool
+ * possible failures:
+ * - trying to remove the last cpu of a pool which still contains domains
+ */
+int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu)
+{
+    int work_cpu;
+
+    spin_lock(&cpupool_lock);
+    if ( !cpu_isset(cpu, c->cpu_valid) )
+    {
+        spin_unlock(&cpupool_lock);
+        return 0;
+    }
+    if ( (c->n_dom > 0) && (cpus_weight(c->cpu_valid) == 1) )
+    {
+        spin_unlock(&cpupool_lock);
+        return -EBUSY;
+    }
+    cpupool_unassign_cpu_locked_1(c, cpu);
+    work_cpu = smp_processor_id();
+    if ( work_cpu == cpu )
+    {
+        work_cpu = first_cpu(cpupool0->cpu_valid);
+        if ( work_cpu == cpu )
+            work_cpu = next_cpu(cpu, cpupool0->cpu_valid);
+    }
+    return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c);
+}
+
+/*
+ * borrow cpu from another cpupool
+ * cpu might be free or already in the correct pool
+ * if the cpu is taken from another pool, all domains of that pool are paused
+ * rc == 0 if not borrowed, 1 if borrowed
+ */
+int cpupool_borrow_cpu(struct cpupool *c, unsigned int cpu)
+{
+    struct cpupool **q;
+    struct domain *d;
+
+    if ( cpu_isset(cpu, c->cpu_valid) )
+        return 0;
+
+    spin_lock(&cpupool_lock);
+
+    if ( cpu_isset(cpu, cpupool_free_cpus) )
+    {
+        cpupool_assign_cpu_locked(c, cpu);
+        cpu_set(cpu, c->cpus_borrowed);
+        cpu_set(cpu, cpupool_free_cpus_borrowed);
+        spin_unlock(&cpupool_lock);
+        return 1;
+    }
+
+    for_each_cpupool(q)
+    {
+        if ( cpu_isset(cpu, (*q)->cpu_valid) )
+            break;
+    }
+    BUG_ON(*q == NULL);
+    if ( (*q)->pool_paused++ == 0 )
+    {
+        for_each_domain(d)
+        {
+            if ( d->cpupool == *q )
+                domain_pause(d);
+        }
+    }
+    /* unassigning cpu can't fail as all domains in pool should be paused */
+    cpupool_unassign_cpu_locked(*q, cpu);
+    cpupool_assign_cpu_locked(c, cpu);
+    cpu_set(cpu, c->cpus_borrowed);
+    cpu_set(cpu, (*q)->cpus_borrowed);
+
+    spin_unlock(&cpupool_lock);
+    return 1;
+}
+
+/*
+ * return borrowed cpus to their former pools
+ * a cpu borrowed via cpupool_borrow_cpu is given back to the pool it was
+ * taken from (or to the free cpus if it was free before)
+ * returns a cpu to continue the operation on, or -1 if everything is done
+ */
+int cpupool_return_cpu(struct cpupool *c)
+{
+    int cpu = -1;
+    cpumask_t mask;
+    struct cpupool **q;
+    struct domain *d;
+
+    spin_lock(&cpupool_lock);
+    if ( cpus_weight(c->cpus_borrowed) == 0 )
+        goto out;
+
+    if ( cpu_isset(smp_processor_id(), c->cpus_borrowed) )
+    {
+        cpus_andnot(mask, c->cpu_valid, c->cpus_borrowed);
+        cpu = first_cpu(mask);
+        BUG_ON(cpu == NR_CPUS);
+        goto out;
+    }
+
+    for_each_cpu_mask(cpu, c->cpus_borrowed)
+    {
+        BUG_ON(!cpu_isset(cpu, c->cpu_valid));
+        if ( cpu_isset(cpu, cpupool_free_cpus_borrowed) )
+        {
+            cpu_clear(cpu, cpupool_free_cpus_borrowed);
+            cpu_clear(cpu, c->cpus_borrowed);
+            if ( !cpupool_unassign_cpu_locked(c, cpu) )
+                continue;
+            /* could not move all vcpus, try again */
+            cpu_set(cpu, cpupool_free_cpus_borrowed);
+            cpu_set(cpu, c->cpus_borrowed);
+            goto out;
+        }
+        for_each_cpupool(q)
+        {
+            if ( (*q != c) && cpu_isset(cpu, (*q)->cpus_borrowed) )
+                break;
+        }
+        BUG_ON(*q == NULL);
+        BUG_ON(!(*q)->pool_paused);
+        cpu_clear(cpu, (*q)->cpus_borrowed);
+        cpu_clear(cpu, c->cpus_borrowed);
+        if ( cpupool_unassign_cpu_locked(c, cpu) )
+        {
+            cpu_set(cpu, (*q)->cpus_borrowed);
+            cpu_set(cpu, c->cpus_borrowed);
+            goto out;
+        }
+        cpupool_assign_cpu_locked(*q, cpu);
+        if ( (*q)->pool_paused == 1 )
+        {
+            for_each_domain(d)
+            {
+                if ( d->cpupool == *q )
+                    domain_unpause(d);
+            }
+        }
+        (*q)->pool_paused--;
+    }
+    cpu = -1;
+
+out:
+    spin_unlock(&cpupool_lock);
+    return cpu;
+}
+
+/*
+ * assign cpus to the default cpupool
+ * by default all cpus are used; fewer can be requested via pool0_max_cpus
+ * possible failures:
+ * - no cpu assigned
+ */
+int __init cpupool0_cpu_assign(struct cpupool *c)
+{
+    if ( (cpupool0_max_cpus == 0) || (cpupool0_max_cpus > num_online_cpus()) )
+        cpupool0_max_cpus = num_online_cpus();
+    if ( !cpupool_assign_ncpu(cpupool0, cpupool0_max_cpus) )
+        return 1;
+    return 0;
+}
+
+/*
+ * add a new domain to a cpupool
+ * possible failures:
+ * - pool does not exist
+ * - pool is paused
+ * - no cpu assigned to pool
+ */
+int cpupool_add_domain(struct domain *d, int poolid)
+{
+    struct cpupool *c;
+    int rc = 1;
+
+    if ( poolid == CPUPOOLID_NONE )
+        return 0;
+    spin_lock(&cpupool_lock);
+    c = cpupool_find_by_id(poolid, 1);
+    if ( (c != NULL) && !c->pool_paused && cpus_weight(c->cpu_valid) )
+    {
+        c->n_dom++;
+        d->cpupool = c;
+        PRINTD("cpupool_add_domain(%d,%d) n_dom %d\n", d->domain_id, poolid,
+            c->n_dom);
+        rc = 0;
+    }
+    spin_unlock(&cpupool_lock);
+    return rc;
+}
+
+/*
+ * remove a domain from a cpupool
+ */
+void cpupool_rm_domain(struct domain *d)
+{
+    if ( d->cpupool == NULL )
+        return;
+    spin_lock(&cpupool_lock);
+    d->cpupool->n_dom--;
+    PRINTD("cpupool_rm_domain(%d,%d) n_dom %d\n", d->domain_id,
+        d->cpupool->cpupool_id, d->cpupool->n_dom);
+    d->cpupool = NULL;
+    spin_unlock(&cpupool_lock);
+    return;
+}
+
+/*
+ * called to add a new cpu to pool admin
+ * a hotplugged cpu is added to cpupool0 so that it can be assigned to dom0
+ */
+void cpupool_cpu_add(unsigned int cpu)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+    if ( cpupool0 == NULL )
+        return;
+    spin_lock(&cpupool_lock);
+    cpu_set(cpu, cpupool_free_cpus);
+    cpupool_assign_cpu_locked(cpupool0, cpu);
+    spin_unlock(&cpupool_lock);
+#endif
+    return;
+}
+
+/* called to remove a cpu from pool admin
+ * possible failures:
+ * - cpu is last one in a pool with domains in it
+ * - pool is paused
+ */
+int cpupool_cpu_remove(unsigned int cpu)
+{
+    int rc = 0;
+#ifdef CONFIG_HOTPLUG_CPU
+    struct cpupool **q;
+
+    spin_lock(&cpupool_lock);
+    if ( cpu_isset(cpu, cpupool_free_cpus) )
+    {
+        cpu_clear(cpu, cpupool_free_cpus);
+        goto out;
+    }
+    for_each_cpupool(q)
+        if ( cpu_isset(cpu, (*q)->cpu_valid) )
+            break;
+    if ( *q == NULL )
+        goto out;
+    if ( (((*q)->n_dom == 0) || (cpus_weight((*q)->cpu_valid) > 1)) &&
+         !(*q)->pool_paused )
+    {
+        cpu_clear(cpu, (*q)->cpu_valid);
+        schedule_cpu_switch(cpu, NULL);
+        per_cpu(cpupool, cpu) = NULL;
+    }
+    else
+        rc = 1;
+out:
+    spin_unlock(&cpupool_lock);
+#endif
+    return rc;
+}
+
+/*
+ * do cpupool related domctl operations
+ */
+int cpupool_do_domctl(struct xen_domctl *op)
+{
+    int ret;
+    struct cpupool *c;
+
+    switch ( op->u.cpupool_op.op )
+    {
+
+    case XEN_DOMCTL_CPUPOOL_OP_CREATE:
+    {
+        int poolid;
+        struct scheduler *sched;
+
+        poolid = (op->u.cpupool_op.cpupool_id == XEN_DOMCTL_CPUPOOL_PAR_ANY) ?
+            CPUPOOLID_NONE: op->u.cpupool_op.cpupool_id;
+        sched = scheduler_get_by_id(op->u.cpupool_op.sched_id);
+        ret = -ENOENT;
+        if ( sched == NULL )
+            break;
+        ret = 0;
+        c = cpupool_create(poolid, sched->opt_name);
+        if ( c == NULL )
+            ret = -EINVAL;
+        else
+            op->u.cpupool_op.cpupool_id = c->cpupool_id;
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_DESTROY:
+    {
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->u.cpupool_op.cpupool_id, 1);
+        spin_unlock(&cpupool_lock);
+        ret = -ENOENT;
+        if ( c == NULL )
+            break;
+        ret = (cpupool_destroy(c) != 0) ? -EBUSY : 0;
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_INFO:
+    {
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->u.cpupool_op.cpupool_id, 0);
+        spin_unlock(&cpupool_lock);
+        ret = -ENOENT;
+        if ( c == NULL )
+            break;
+        op->u.cpupool_op.cpupool_id = c->cpupool_id;
+        op->u.cpupool_op.sched_id = c->sched.sched_id;
+        op->u.cpupool_op.n_dom = c->n_dom;
+        cpumask_to_xenctl_cpumap(&(op->u.cpupool_op.cpumap), &(c->cpu_valid));
+        ret = 0;
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_ADDCPU:
+    {
+        unsigned cpu;
+
+        cpu = op->u.cpupool_op.cpu;
+        spin_lock(&cpupool_lock);
+        if ( cpu == XEN_DOMCTL_CPUPOOL_PAR_ANY )
+            cpu = first_cpu(cpupool_free_cpus);
+        ret = -EINVAL;
+        if ( cpu >= NR_CPUS )
+            goto addcpu_out;
+        ret = -EBUSY;
+        if ( !cpu_isset(cpu, cpupool_free_cpus) )
+            goto addcpu_out;
+        c = cpupool_find_by_id(op->u.cpupool_op.cpupool_id, 0);
+        ret = -ENOENT;
+        if ( c == NULL )
+            goto addcpu_out;
+        cpupool_assign_cpu_locked(c, cpu);
+        ret = 0;
+addcpu_out:
+        spin_unlock(&cpupool_lock);
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_RMCPU:
+    {
+        unsigned cpu;
+
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->u.cpupool_op.cpupool_id, 0);
+        spin_unlock(&cpupool_lock);
+        ret = -ENOENT;
+        if ( c == NULL )
+            break;
+        cpu = op->u.cpupool_op.cpu;
+        if ( cpu == XEN_DOMCTL_CPUPOOL_PAR_ANY )
+            cpu = last_cpu(c->cpu_valid);
+        ret = -EINVAL;
+        if ( cpu >= NR_CPUS )
+            break;
+        /* caution: cpupool_unassign_cpu uses continue_hypercall_on_cpu and
+         * may only finish after this function has already returned
+         */
+        ret = cpupool_unassign_cpu(c, cpu);
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_MOVEDOMAIN:
+    {
+        struct domain *d;
+
+        ret = -EINVAL;
+        if ( op->u.cpupool_op.domid == 0 )
+            break;
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(op->u.cpupool_op.domid);
+        if ( d == NULL )
+            break;
+        if ( d->cpupool == NULL )
+        {
+            ret = -EINVAL;
+            rcu_unlock_domain(d);
+            break;
+        }
+        ret = -ENOENT;
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->u.cpupool_op.cpupool_id, 1);
+        if ( (c != NULL) && cpus_weight(c->cpu_valid) && !c->pool_paused )
+        {
+            PRINTD("cpupool move_domain(%d)->%d\n", d->domain_id,
+                c->cpupool_id);
+            d->cpupool->n_dom--;
+            PRINTD("cpupool move_domain(%d), %d.n_dom=%d\n", d->domain_id,
+                d->cpupool->cpupool_id, d->cpupool->n_dom);
+            sched_move_domain(d, c);
+            c->n_dom++;
+            PRINTD("cpupool move_domain(%d), %d.n_dom=%d\n", d->domain_id,
+                c->cpupool_id, c->n_dom);
+            PRINTD("cpupool move_domain(%d)->%d ready\n", d->domain_id,
+                c->cpupool_id);
+            ret = 0;
+        }
+        spin_unlock(&cpupool_lock);
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_FREEINFO:
+    {
+        cpumask_to_xenctl_cpumap(&(op->u.cpupool_op.cpumap),
+            &cpupool_free_cpus);
+        ret = 0;
+    }
+    break;
+
+    default:
+        ret = -ENOSYS;
+
+    }
+
+    return ret;
+}
+
+void schedule_dump(struct cpupool *c);
+
+void dump_runq(unsigned char key)
+{
+    unsigned long    flags;
+    s_time_t         now = NOW();
+    struct cpupool **c;
+
+    spin_lock(&cpupool_lock);
+    local_irq_save(flags);
+
+    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);
+
+    printk("Idle cpupool:\n");
+    schedule_dump(NULL);
+
+    for_each_cpupool(c)
+    {
+        printk("Cpupool %d:\n", (*c)->cpupool_id);
+        schedule_dump(*c);
+    }
+
+    local_irq_restore(flags);
+    spin_unlock(&cpupool_lock);
+}
+
+static int __init cpupool_init(void)
+{
+    cpupool_free_cpus = cpu_online_map;
+    cpus_clear(cpupool_free_cpus_borrowed);
+    cpupool_list = NULL;
+    return 0;
+}
+__initcall(cpupool_init);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
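
A short usage note on the new boot parameter defined above via
integer_param("pool0_max_cpus", ...): it caps the number of cpus assigned to
Pool-0 at boot, leaving the remaining online cpus in cpupool_free_cpus for
pools created later.  An illustrative boot entry (grub legacy style, paths
are placeholders, not taken from this patch):

kernel /boot/xen.gz pool0_max_cpus=4

Cpus left out of Pool-0 this way can afterwards be handed to newly created
pools via XEN_DOMCTL_CPUPOOL_OP_ADDCPU.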