diff -r 7bd1dd9fb30f xen/common/Makefile --- a/xen/common/Makefile Wed Jan 13 14:15:51 2010 +0000 +++ b/xen/common/Makefile Wed Jan 13 14:36:58 2010 +0000 @@ -13,6 +13,7 @@ obj-y += page_alloc.o obj-y += rangeset.o obj-y += sched_credit.o +obj-y += sched_credit2.o obj-y += sched_sedf.o obj-y += schedule.o obj-y += shutdown.o diff -r 7bd1dd9fb30f xen/common/sched_credit2.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/common/sched_credit2.c Wed Jan 13 14:36:58 2010 +0000 @@ -0,0 +1,1037 @@ + +/**************************************************************************** + * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd + **************************************************************************** + * + * File: common/csched_credit2.c + * Author: George Dunlap + * + * Description: Credit-based SMP CPU scheduler + * Based on an earlier verson by Emmanuel Ackaouy. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __i386__ +#define PRI_stime "lld" +#else +#define PRI_stime "ld" +#endif + +#define d2printk(x...) +//#define d2printk printk + +#define TRC_CSCHED2_TICK TRC_SCHED_CLASS + 1 +#define TRC_CSCHED2_RUNQ_POS TRC_SCHED_CLASS + 2 +#define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS + 3 +#define TRC_CSCHED2_CREDIT_ADD TRC_SCHED_CLASS + 4 +#define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS + 5 + +/* + * WARNING: This is still in an experimental phase. Status and work can be found at the + * credit2 wiki page: + * http://wiki.xensource.com/xenwiki/Credit2_Scheduler_Development + */ + +/* + * Design: + * + * VMs "burn" credits based on their weight; higher weight means credits burn + * more slowly. + * + * vcpus are inserted into the runqueue by credit order. + * + * Credits are "reset" when the next vcpu in the runqueue is less than or equal to zero. At that + * point, everyone's credits are "clipped" to a small value, and a fixed credit is added to everyone. + * + * The plan is for all cores that share an L2 will share the same runqueue. At the moment, there is + * one global runqueue for all cores. + */ + +/* + * Basic constants + */ +#define CSCHED_DEFAULT_WEIGHT 256 +#define CSCHED_MIN_TIMER MICROSECS(500) +#define CSCHED_CARRYOVER_MAX CSCHED_MIN_TIMER +#define CSCHED_CREDIT_RESET 0 +#define CSCHED_CREDIT_INIT MILLISECS(10) +#define CSCHED_MAX_TIMER MILLISECS(2) + +#define CSCHED_IDLE_CREDIT (-(1<<30)) + +/* + * Flags + */ +/* CSFLAG_scheduled: Is this vcpu either running on, or context-switching off, + * a physical cpu? + * + Accessed only with runqueue lock held + * + Set when chosen as next in csched_schedule(). + * + Cleared after context switch has been saved in csched_context_saved() + * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should + * set CSFLAG_delayed_runq_add + * + Checked to be false in runq_insert. + */ +#define __CSFLAG_scheduled 1 +#define CSFLAG_scheduled (1<<__CSFLAG_scheduled) +/* CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it'd done + * being context switched out? + * + Set when scheduling out in csched_schedule() if prev is runnable + * + Set in csched_vcpu_wake if it finds CSFLAG_scheduled set + * + Read in csched_context_switched(). If set, it adds prev to the runqueue and + * clears the bit. 
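To make the design notes and constants above concrete, here is a small stand-alone illustration (not part of the patch) of how a domain's weight translates into a credit burn rate under the conversion rule used by t2c() later in this file. The MILLISECS() definition is copied locally for the example only and assumes Xen's nanosecond-based s_time_t.

    #include <stdio.h>
    #include <stdint.h>

    /* Local stand-in for Xen's MILLISECS(): nanoseconds (assumption). */
    #define MILLISECS(ms) ((int64_t)(ms) * 1000000)

    int main(void)
    {
        int64_t max_weight = 512;        /* weight of the heaviest domain */
        int64_t runtime = MILLISECS(1);  /* 1 ms of CPU time consumed */
        int weights[] = { 512, 256, 128 };

        for ( int i = 0; i < 3; i++ )
        {
            /* Same rule as t2c(): credit burned = time * max_weight / weight,
             * so halving a domain's weight doubles its burn rate. */
            int64_t burned = runtime * max_weight / weights[i];
            printf("weight %3d burns %lld ns of credit per 1 ms of run time\n",
                   weights[i], (long long)burned);
        }
        return 0;
    }

Since every vcpu receives the same top-up at each credit reset, a faster burn rate means the vcpu drops down the credit-ordered runqueue sooner, which is how weight becomes a proportional share of CPU time.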
+ */ +#define __CSFLAG_delayed_runq_add 2 +#define CSFLAG_delayed_runq_add (1<<__CSFLAG_delayed_runq_add) + + +/* + * Useful macros + */ +#define CSCHED_PCPU(_c) \ + ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv) +#define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv) +#define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv) +//#define RUNQ(_cpu) (&(CSCHED_GROUP(_cpu)->runq)) +#define RUNQ(_cpu) (&csched_priv.runq) + +/* + * System-wide private data + */ +struct csched_private { + spinlock_t lock; + uint32_t ncpus; + struct domain *idle_domain; + + /* Per-runqueue info */ + struct list_head runq; /* Global runqueue */ + int max_weight; + struct list_head sdom; + struct list_head svc; /* List of all vcpus */ +}; + +struct csched_pcpu { + int _dummy; +}; + +/* + * Virtual CPU + */ +struct csched_vcpu { + struct list_head global_elem; /* On the global vcpu list */ + struct list_head sdom_elem; /* On the domain vcpu list */ + struct list_head runq_elem; /* On the runqueue */ + + /* Up-pointers */ + struct csched_dom *sdom; + struct vcpu *vcpu; + + int weight; + + int credit; + s_time_t start_time; /* When we were scheduled (used for credit) */ + unsigned flags; /* 16 bits doesn't seem to play well with clear_bit() */ + +}; + +/* + * Domain + */ +struct csched_dom { + struct list_head vcpu; + struct list_head sdom_elem; + struct domain *dom; + uint16_t weight; + uint16_t nr_vcpus; +}; + + +/* + * Global variables + */ +static struct csched_private csched_priv; + +/* + * Time-to-credit, credit-to-time. + * FIXME: Do pre-calculated division? + */ +static s_time_t t2c(s_time_t time, struct csched_vcpu *svc) +{ + return time * csched_priv.max_weight / svc->weight; +} + +static s_time_t c2t(s_time_t credit, struct csched_vcpu *svc) +{ + return credit * svc->weight / csched_priv.max_weight; +} + +/* + * Runqueue related code + */ + +static /*inline*/ int +__vcpu_on_runq(struct csched_vcpu *svc) +{ + return !list_empty(&svc->runq_elem); +} + +static /*inline*/ struct csched_vcpu * +__runq_elem(struct list_head *elem) +{ + return list_entry(elem, struct csched_vcpu, runq_elem); +} + +static int +__runq_insert(struct list_head *runq, struct csched_vcpu *svc) +{ + struct list_head *iter; + int pos = 0; + + d2printk("rqi d%dv%d\n", + svc->vcpu->domain->domain_id, + svc->vcpu->vcpu_id); + + /* Idle vcpus not allowed on the runqueue anymore */ + BUG_ON(is_idle_vcpu(svc->vcpu)); + BUG_ON(svc->vcpu->is_running); + BUG_ON(test_bit(__CSFLAG_scheduled, &svc->flags)); + + list_for_each( iter, runq ) + { + struct csched_vcpu * iter_svc = __runq_elem(iter); + + if ( svc->credit > iter_svc->credit ) + { + d2printk(" p%d d%dv%d\n", + pos, + iter_svc->vcpu->domain->domain_id, + iter_svc->vcpu->vcpu_id); + break; + } + pos++; + } + + list_add_tail(&svc->runq_elem, iter); + + return pos; +} + +static void +runq_insert(unsigned int cpu, struct csched_vcpu *svc) +{ + struct list_head * runq = RUNQ(cpu); + int pos = 0; + + /* FIXME: Runqueue per L2 */ + ASSERT( spin_is_locked(&csched_priv.lock) ); + + BUG_ON( __vcpu_on_runq(svc) ); + /* FIXME: Check runqueue handles this cpu*/ + //BUG_ON( cpu != svc->vcpu->processor ); + + pos = __runq_insert(runq, svc); + + { + struct { + unsigned dom:16,vcpu:16; + unsigned pos; + } d; + d.dom = svc->vcpu->domain->domain_id; + d.vcpu = svc->vcpu->vcpu_id; + d.pos = pos; + trace_var(TRC_CSCHED2_RUNQ_POS, 1, + sizeof(d), + (unsigned char *)&d); + } + + return; +} + +static inline void +__runq_remove(struct csched_vcpu *svc) +{ + BUG_ON( 
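A toy model (not part of the patch) of the ordering that __runq_insert() above maintains: the runqueue is kept sorted by credit, highest first, and a newcomer is placed after any existing entries with equal credit because the comparison is strictly greater-than. An array is used here instead of Xen's list_head purely for brevity.

    #include <stdio.h>

    /* Return the position __runq_insert() would report for a new credit
     * value against an already-sorted queue (descending credit). */
    static int runq_insert_pos(const int *credits, int n, int new_credit)
    {
        int pos = 0;
        while ( pos < n && !(new_credit > credits[pos]) )
            pos++;
        return pos;
    }

    int main(void)
    {
        int runq[] = { 30, 20, 10 };  /* credits of already-queued vcpus */
        printf("credit 25 -> pos %d\n", runq_insert_pos(runq, 3, 25)); /* 1 */
        printf("credit 20 -> pos %d\n", runq_insert_pos(runq, 3, 20)); /* 2 */
        printf("credit  5 -> pos %d\n", runq_insert_pos(runq, 3,  5)); /* 3 */
        return 0;
    }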
!__vcpu_on_runq(svc) ); + list_del_init(&svc->runq_elem); +} + +void burn_credits(struct csched_vcpu *, s_time_t); + +/* Check to see if the item on the runqueue is higher priority than what's + * currently running; if so, wake up the processor */ +static /*inline*/ void +runq_tickle(unsigned int cpu, struct csched_vcpu *new, s_time_t now) +{ + int i, ipid=-1; + s_time_t lowest=(1<<30); + + d2printk("rqt d%dv%d cd%dv%d\n", + new->vcpu->domain->domain_id, + new->vcpu->vcpu_id, + current->domain->domain_id, + current->vcpu_id); + + /* Find the cpu in this queue group that has the lowest credits */ + /* FIXME: separate runqueues */ + for_each_online_cpu ( i ) + { + struct csched_vcpu * const cur = + CSCHED_VCPU(per_cpu(schedule_data, i).curr); + + /* FIXME: keep track of idlers, chose from the mask */ + if ( is_idle_vcpu(cur->vcpu) ) + { + ipid = i; + lowest = CSCHED_IDLE_CREDIT; + break; + } + else + { + /* Update credits for current to see if we want to preempt */ + burn_credits(cur, now); + + if ( cur->credit < lowest ) + { + ipid = i; + lowest = cur->credit; + } + + /* TRACE */ { + struct { + unsigned dom:16,vcpu:16; + unsigned credit; + } d; + d.dom = cur->vcpu->domain->domain_id; + d.vcpu = cur->vcpu->vcpu_id; + d.credit = cur->credit; + trace_var(TRC_CSCHED2_TICKLE_CHECK, 1, + sizeof(d), + (unsigned char *)&d); + } + } + } + + if ( ipid != -1 ) + { + int cdiff = lowest - new->credit; + + if ( lowest == CSCHED_IDLE_CREDIT || cdiff < 0 ) { + d2printk("si %d\n", ipid); + cpu_raise_softirq(ipid, SCHEDULE_SOFTIRQ); + } + else + /* FIXME: Wake up later? */; + } +} + +/* + * Credit-related code + */ +static void reset_credit(int cpu, s_time_t now) +{ + struct list_head *iter; + + list_for_each( iter, &csched_priv.svc ) + { + struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, global_elem); + s_time_t cmax; + + BUG_ON( is_idle_vcpu(svc->vcpu) ); + + /* Maximum amount of credit that can be carried over */ + cmax = CSCHED_CARRYOVER_MAX; + + if ( svc->credit > cmax ) + svc->credit = cmax; + svc->credit += CSCHED_CREDIT_INIT; /* Find a better name */ + svc->start_time = now; + + /* Trace credit */ + } + + /* No need to resort runqueue, as everyone's order should be the same. */ +} + +void burn_credits(struct csched_vcpu *svc, s_time_t now) +{ + s_time_t delta; + + /* Assert svc is current */ + ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr)); + + if ( is_idle_vcpu(svc->vcpu) ) + { + BUG_ON(svc->credit != CSCHED_IDLE_CREDIT); + return; + } + + delta = now - svc->start_time; + + if ( delta > 0 ) { + /* This will round down; should we consider rounding up...? */ + svc->credit -= t2c(delta, svc); + svc->start_time = now; + + d2printk("b d%dv%d c%d\n", + svc->vcpu->domain->domain_id, + svc->vcpu->vcpu_id, + svc->credit); + } else { + d2printk("%s: Time went backwards? now %"PRI_stime" start %"PRI_stime"\n", + __func__, now, svc->start_time); + } + + /* TRACE */ + { + struct { + unsigned dom:16,vcpu:16; + unsigned credit; + int delta; + } d; + d.dom = svc->vcpu->domain->domain_id; + d.vcpu = svc->vcpu->vcpu_id; + d.credit = svc->credit; + d.delta = delta; + trace_var(TRC_CSCHED2_CREDIT_BURN, 1, + sizeof(d), + (unsigned char *)&d); + } +} + +/* Find the domain with the highest weight. 
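A worked example (not part of the patch) of the reset_credit() pass above: when the vcpu at the head of the runqueue reaches CSCHED_CREDIT_RESET, every vcpu's credit is clipped to CSCHED_CARRYOVER_MAX and then topped up by CSCHED_CREDIT_INIT, so relative order is preserved and no runqueue re-sort is needed. Values assume Xen's nanosecond s_time_t.

    #include <stdio.h>
    #include <stdint.h>

    #define MICROSECS(us) ((int64_t)(us) * 1000)   /* local stand-ins */
    #define MILLISECS(ms) ((int64_t)(ms) * 1000000)
    #define CARRYOVER_MAX MICROSECS(500)            /* CSCHED_CARRYOVER_MAX */
    #define CREDIT_INIT   MILLISECS(10)             /* CSCHED_CREDIT_INIT */

    int main(void)
    {
        /* Three vcpus at reset time: far ahead, nearly out, and negative. */
        int64_t credits[] = { MILLISECS(2), MICROSECS(100), -MICROSECS(300) };

        for ( int i = 0; i < 3; i++ )
        {
            int64_t c = credits[i];
            if ( c > CARRYOVER_MAX )
                c = CARRYOVER_MAX;   /* clip the carry-over */
            c += CREDIT_INIT;        /* everyone gets the same fresh allocation */
            printf("%lld ns -> %lld ns\n", (long long)credits[i], (long long)c);
        }
        return 0;   /* prints 10.5 ms, 10.1 ms and 9.7 ms: order unchanged */
    }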
*/ +void update_max_weight(int new_weight, int old_weight) +{ + if ( new_weight > csched_priv.max_weight ) + { + csched_priv.max_weight = new_weight; + printk("%s: Max weight %d\n", __func__, csched_priv.max_weight); + } + else if ( old_weight == csched_priv.max_weight ) + { + struct list_head *iter; + int max_weight = 1; + + list_for_each( iter, &csched_priv.sdom ) + { + struct csched_dom * sdom = list_entry(iter, struct csched_dom, sdom_elem); + + if ( sdom->weight > max_weight ) + max_weight = sdom->weight; + } + + csched_priv.max_weight = max_weight; + printk("%s: Max weight %d\n", __func__, csched_priv.max_weight); + } +} + +/* + * Initialization code + */ +static int +csched_pcpu_init(int cpu) +{ + unsigned long flags; + struct csched_pcpu *spc; + + /* Allocate per-PCPU info */ + spc = xmalloc(struct csched_pcpu); + if ( spc == NULL ) + return -1; + + spin_lock_irqsave(&csched_priv.lock, flags); + + /* Initialize/update system-wide config */ + per_cpu(schedule_data, cpu).sched_priv = spc; + + csched_priv.ncpus++; + + /* Start off idling... */ + BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr)); + + spin_unlock_irqrestore(&csched_priv.lock, flags); + + return 0; +} + +#ifndef NDEBUG +static /*inline*/ void +__csched_vcpu_check(struct vcpu *vc) +{ + struct csched_vcpu * const svc = CSCHED_VCPU(vc); + struct csched_dom * const sdom = svc->sdom; + + BUG_ON( svc->vcpu != vc ); + BUG_ON( sdom != CSCHED_DOM(vc->domain) ); + if ( sdom ) + { + BUG_ON( is_idle_vcpu(vc) ); + BUG_ON( sdom->dom != vc->domain ); + } + else + { + BUG_ON( !is_idle_vcpu(vc) ); + } +} +#define CSCHED_VCPU_CHECK(_vc) (__csched_vcpu_check(_vc)) +#else +#define CSCHED_VCPU_CHECK(_vc) +#endif + +static int +csched_vcpu_init(struct vcpu *vc) +{ + struct domain * const dom = vc->domain; + struct csched_dom *sdom = CSCHED_DOM(dom); + struct csched_vcpu *svc; + + printk("%s: Initializing d%dv%d\n", + __func__, dom->domain_id, vc->vcpu_id); + + /* Allocate per-VCPU info */ + svc = xmalloc(struct csched_vcpu); + if ( svc == NULL ) + return -1; + + INIT_LIST_HEAD(&svc->global_elem); + INIT_LIST_HEAD(&svc->sdom_elem); + INIT_LIST_HEAD(&svc->runq_elem); + + svc->sdom = sdom; + svc->vcpu = vc; + svc->flags = 0U; + vc->sched_priv = svc; + + if ( ! 
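A toy model (not part of the patch) of what update_max_weight() above maintains: csched_priv.max_weight is the single largest domain weight, which t2c()/c2t() use to scale every domain's burn rate relative to the heaviest one. Raising a weight past the maximum bumps it directly; lowering the weight that held the maximum forces a rescan of all domains.

    #include <stdio.h>

    static int rescan_max(const int *weights, int n)
    {
        int max_weight = 1;
        for ( int i = 0; i < n; i++ )
            if ( weights[i] > max_weight )
                max_weight = weights[i];
        return max_weight;
    }

    int main(void)
    {
        int weights[] = { 256, 512, 128 };            /* per-domain weights */
        printf("max = %d\n", rescan_max(weights, 3)); /* 512 */

        weights[1] = 64;                              /* old maximum lowered... */
        printf("max = %d\n", rescan_max(weights, 3)); /* ...rescan finds 256 */
        return 0;
    }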
is_idle_vcpu(vc) ) + { + BUG_ON( sdom == NULL ); + + svc->credit = CSCHED_CREDIT_INIT; + svc->weight = sdom->weight; + + list_add_tail(&svc->sdom_elem, &sdom->vcpu); + list_add_tail(&svc->global_elem, &csched_priv.svc); + sdom->nr_vcpus++; + } + else + { + BUG_ON( sdom != NULL ); + svc->credit = CSCHED_IDLE_CREDIT; + svc->weight = 0; + if ( csched_priv.idle_domain == NULL ) + csched_priv.idle_domain = dom; + } + + /* Allocate per-PCPU info */ + if ( unlikely(!CSCHED_PCPU(vc->processor)) ) + { + if ( csched_pcpu_init(vc->processor) != 0 ) + return -1; + } + + CSCHED_VCPU_CHECK(vc); + return 0; +} + +static void +csched_vcpu_destroy(struct vcpu *vc) +{ + struct csched_vcpu * const svc = CSCHED_VCPU(vc); + struct csched_dom * const sdom = svc->sdom; + unsigned long flags; + + BUG_ON( sdom == NULL ); + BUG_ON( !list_empty(&svc->runq_elem) ); + + spin_lock_irqsave(&csched_priv.lock, flags); + + /* Remove from sdom list */ + list_del_init(&svc->global_elem); + list_del_init(&svc->sdom_elem); + + sdom->nr_vcpus--; + + spin_unlock_irqrestore(&csched_priv.lock, flags); + + xfree(svc); +} + +static void +csched_vcpu_sleep(struct vcpu *vc) +{ + struct csched_vcpu * const svc = CSCHED_VCPU(vc); + + BUG_ON( is_idle_vcpu(vc) ); + + if ( per_cpu(schedule_data, vc->processor).curr == vc ) + cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ); + else if ( __vcpu_on_runq(svc) ) + __runq_remove(svc); +} + +static void +csched_vcpu_wake(struct vcpu *vc) +{ + struct csched_vcpu * const svc = CSCHED_VCPU(vc); + const unsigned int cpu = vc->processor; + s_time_t now = 0; + int flags; + + d2printk("w d%dv%d\n", vc->domain->domain_id, vc->vcpu_id); + + BUG_ON( is_idle_vcpu(vc) ); + + /* FIXME: Runqueue per L2 */ + spin_lock_irqsave(&csched_priv.lock, flags); + + + /* Make sure svc priority mod happens before runq check */ + if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) ) + { + goto out; + } + + if ( unlikely(__vcpu_on_runq(svc)) ) + { + /* If we've boosted someone that's already on a runqueue, prioritize + * it and inform the cpu in question. */ + goto out; + } + + /* If the context hasn't been saved for this vcpu yet, we can't put it on + * another runqueue. Instead, we set a flag so that it will be put on the runqueue + * after the context has been saved. 
*/ + if ( unlikely (test_bit(__CSFLAG_scheduled, &svc->flags) ) ) + { + set_bit(__CSFLAG_delayed_runq_add, &svc->flags); + goto out; + } + + now = NOW(); + + /* Put the VCPU on the runq */ + runq_insert(cpu, svc); + runq_tickle(cpu, svc, now); + +out: + spin_unlock_irqrestore(&csched_priv.lock, flags); + d2printk("w-\n"); + return; +} + +static void +csched_context_saved(struct vcpu *vc) +{ + int flags; + struct csched_vcpu * const svc = CSCHED_VCPU(vc); + + spin_lock_irqsave(&csched_priv.lock, flags); + + /* This vcpu is now eligible to be put on the runqueue again */ + clear_bit(__CSFLAG_scheduled, &svc->flags); + + /* If someone wants it there, put it there */ + if ( test_bit(__CSFLAG_delayed_runq_add, &svc->flags) ) + { + const unsigned int cpu = vc->processor; + + clear_bit(__CSFLAG_delayed_runq_add, &svc->flags); + + BUG_ON(__vcpu_on_runq(svc)); + + runq_insert(cpu, svc); + runq_tickle(cpu, svc, NOW()); + } + + spin_unlock_irqrestore(&csched_priv.lock, flags); +} + +static int +csched_cpu_pick(struct vcpu *vc) +{ + /* FIXME: Chose a schedule group based on load */ + return 0; +} + +static int +csched_dom_cntl( + struct domain *d, + struct xen_domctl_scheduler_op *op) +{ + struct csched_dom * const sdom = CSCHED_DOM(d); + unsigned long flags; + + if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo ) + { + op->u.credit2.weight = sdom->weight; + } + else + { + ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo); + + if ( op->u.credit2.weight != 0 ) + { + struct list_head *iter; + int old_weight; + + spin_lock_irqsave(&csched_priv.lock, flags); + + old_weight = sdom->weight; + + sdom->weight = op->u.credit2.weight; + + /* Update max weight */ + update_max_weight(sdom->weight, old_weight); + + /* Update weights for vcpus */ + list_for_each ( iter, &sdom->vcpu ) + { + struct csched_vcpu *svc = list_entry(iter, struct csched_vcpu, sdom_elem); + + svc->weight = sdom->weight; + } + + spin_unlock_irqrestore(&csched_priv.lock, flags); + } + } + + return 0; +} + +static int +csched_dom_init(struct domain *dom) +{ + struct csched_dom *sdom; + int flags; + + printk("%s: Initializing domain %d\n", __func__, dom->domain_id); + + if ( is_idle_domain(dom) ) + return 0; + + sdom = xmalloc(struct csched_dom); + if ( sdom == NULL ) + return -ENOMEM; + + /* Initialize credit and weight */ + INIT_LIST_HEAD(&sdom->vcpu); + INIT_LIST_HEAD(&sdom->sdom_elem); + sdom->dom = dom; + sdom->weight = CSCHED_DEFAULT_WEIGHT; + sdom->nr_vcpus = 0; + + dom->sched_priv = sdom; + + spin_lock_irqsave(&csched_priv.lock, flags); + + update_max_weight(sdom->weight, 0); + list_add_tail(&sdom->sdom_elem, &csched_priv.sdom); + + spin_unlock_irqrestore(&csched_priv.lock, flags); + + return 0; +} + +static void +csched_dom_destroy(struct domain *dom) +{ + struct csched_dom *sdom = CSCHED_DOM(dom); + int flags; + + BUG_ON(!list_empty(&sdom->vcpu)); + + spin_lock_irqsave(&csched_priv.lock, flags); + + list_del_init(&sdom->sdom_elem); + + update_max_weight(0, sdom->weight); + + spin_unlock_irqrestore(&csched_priv.lock, flags); + + xfree(CSCHED_DOM(dom)); +} + +#if 0 +static void csched_load_balance(int cpu) +{ + /* FIXME: Do something. */ +} +#endif + +/* How long should we let this vcpu run for? */ +static s_time_t +csched_runtime(int cpu, struct csched_vcpu *snext) +{ + s_time_t time = CSCHED_MAX_TIMER; + struct list_head *runq = RUNQ(cpu); + + if ( is_idle_vcpu(snext->vcpu) ) + return CSCHED_MAX_TIMER; + + /* Basic time */ + time = c2t(snext->credit, snext); + + /* Next guy on runqueue */ + if ( ! 
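A minimal stand-alone model (not part of the patch) of the handshake implemented by csched_vcpu_wake() and csched_context_saved() above: a vcpu that wakes while it is still being switched out (CSFLAG_scheduled set) is not queued immediately; CSFLAG_delayed_runq_add is recorded instead, and the runqueue insertion happens once the context has actually been saved.

    #include <stdio.h>

    #define FLAG_SCHEDULED        (1 << 1)   /* mirrors __CSFLAG_scheduled */
    #define FLAG_DELAYED_RUNQ_ADD (1 << 2)   /* mirrors __CSFLAG_delayed_runq_add */

    static void wake(unsigned *flags)
    {
        if ( *flags & FLAG_SCHEDULED )
            *flags |= FLAG_DELAYED_RUNQ_ADD;  /* defer the insertion */
        else
            printf("wake: insert on runqueue now\n");
    }

    static void context_saved(unsigned *flags)
    {
        *flags &= ~FLAG_SCHEDULED;            /* eligible for queueing again */
        if ( *flags & FLAG_DELAYED_RUNQ_ADD )
        {
            *flags &= ~FLAG_DELAYED_RUNQ_ADD;
            printf("context_saved: perform the deferred insertion\n");
        }
    }

    int main(void)
    {
        unsigned flags = FLAG_SCHEDULED;  /* vcpu still being switched out */
        wake(&flags);                     /* -> deferred */
        context_saved(&flags);            /* -> inserted here */
        return 0;
    }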
list_empty(runq) ) + { + struct csched_vcpu *svc = __runq_elem(runq->next); + s_time_t ntime; + + if ( ! is_idle_vcpu(svc->vcpu) ) + { + ntime = c2t(snext->credit - svc->credit, snext); + + if ( time > ntime ) + time = ntime; + } + } + + /* Check limits */ + if ( time < CSCHED_MIN_TIMER ) + time = CSCHED_MIN_TIMER; + else if ( time > CSCHED_MAX_TIMER ) + time = CSCHED_MAX_TIMER; + + return time; +} + +void __dump_execstate(void *unused); + +/* + * This function is in the critical path. It is designed to be simple and + * fast for the common case. + */ +static struct task_slice +csched_schedule(s_time_t now) +{ + const int cpu = smp_processor_id(); + struct list_head * const runq = RUNQ(cpu); + //struct csched_pcpu *spc = CSCHED_PCPU(cpu); + struct csched_vcpu * const scurr = CSCHED_VCPU(current); + struct csched_vcpu *snext = NULL; + struct task_slice ret; + int flags; + + CSCHED_VCPU_CHECK(current); + + d2printk("sc p%d c d%dv%d now %"PRI_stime"\n", + cpu, + scurr->vcpu->domain->domain_id, + scurr->vcpu->vcpu_id, + now); + + + /* FIXME: Runqueue per L2 */ + spin_lock_irqsave(&csched_priv.lock, flags); + + /* Update credits */ + burn_credits(scurr, now); + + /* + * Select next runnable local VCPU (ie top of local runq). + * + * If the current vcpu is runnable, and has higher credit than + * the next guy on the queue (or there is noone else), we want to run him again. + * + * If the current vcpu is runnable, and the next guy on the queue + * has higher credit, we want to mark current for delayed runqueue + * add, and remove the next guy from the queue. + * + * If the current vcpu is not runnable, we want to chose the idle + * vcpu for this processor. + */ + if ( list_empty(runq) ) + snext = CSCHED_VCPU(csched_priv.idle_domain->vcpu[cpu]); + else + snext = __runq_elem(runq->next); + + if ( !is_idle_vcpu(current) && vcpu_runnable(current) ) + { + /* If the current vcpu is runnable, and has higher credit + * than the next on the runqueue, run him again. + * Otherwise, set him for delayed runq add. */ + if ( scurr->credit > snext->credit) + snext = scurr; + else + set_bit(__CSFLAG_delayed_runq_add, &scurr->flags); + } + + if ( snext != scurr && !is_idle_vcpu(snext->vcpu) ) + { + __runq_remove(snext); + if ( snext->vcpu->is_running ) + { + printk("p%d: snext d%dv%d running on p%d! scurr d%dv%d\n", + cpu, + snext->vcpu->domain->domain_id, snext->vcpu->vcpu_id, + snext->vcpu->processor, + scurr->vcpu->domain->domain_id, + scurr->vcpu->vcpu_id); + BUG(); + } + set_bit(__CSFLAG_scheduled, &snext->flags); + } + + if ( !is_idle_vcpu(snext->vcpu) && snext->credit <= CSCHED_CREDIT_RESET ) + reset_credit(cpu, now); + + spin_unlock_irqrestore(&csched_priv.lock, flags); + +#if 0 + /* + * Update idlers mask if necessary. When we're idling, other CPUs + * will tickle us when they get extra work. + */ + if ( is_idle_vcpu(snext->vcpu) ) + { + if ( !cpu_isset(cpu, csched_priv.idlers) ) + cpu_set(cpu, csched_priv.idlers); + } + else if ( cpu_isset(cpu, csched_priv.idlers) ) + { + cpu_clear(cpu, csched_priv.idlers); + } +#endif + + if ( !is_idle_vcpu(snext->vcpu) ) + snext->start_time = now; + /* + * Return task to run next... 
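A worked example (not part of the patch) of the slice length computed by csched_runtime() above, simplified by assuming weight == max_weight so that c2t() is the identity: the slice is the smaller of "time to burn all of my credit" and "time until my credit falls to that of the next vcpu on the runqueue", clamped to the [CSCHED_MIN_TIMER, CSCHED_MAX_TIMER] window.

    #include <stdio.h>
    #include <stdint.h>

    #define MICROSECS(us) ((int64_t)(us) * 1000)   /* local stand-ins */
    #define MILLISECS(ms) ((int64_t)(ms) * 1000000)
    #define MIN_TIMER MICROSECS(500)                /* CSCHED_MIN_TIMER */
    #define MAX_TIMER MILLISECS(2)                  /* CSCHED_MAX_TIMER */

    static int64_t slice(int64_t my_credit, int64_t next_credit)
    {
        int64_t time = my_credit;                  /* c2t(credit), equal weights */
        int64_t ntime = my_credit - next_credit;   /* gap to the runqueue head */

        if ( time > ntime )
            time = ntime;
        if ( time < MIN_TIMER )
            time = MIN_TIMER;
        else if ( time > MAX_TIMER )
            time = MAX_TIMER;
        return time;
    }

    int main(void)
    {
        /* 10 ms of credit, next vcpu at 9.2 ms: the 0.8 ms gap sets the slice. */
        printf("%lld ns\n",
               (long long)slice(MILLISECS(10), MILLISECS(9) + MICROSECS(200)));
        /* Large gap: the slice is capped at CSCHED_MAX_TIMER (2 ms). */
        printf("%lld ns\n", (long long)slice(MILLISECS(10), MILLISECS(1)));
        return 0;
    }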
+ */ + ret.time = csched_runtime(cpu, snext); + ret.task = snext->vcpu; + + CSCHED_VCPU_CHECK(ret.task); + return ret; +} + +static void +csched_dump_vcpu(struct csched_vcpu *svc) +{ + printk("[%i.%i] flags=%x cpu=%i", + svc->vcpu->domain->domain_id, + svc->vcpu->vcpu_id, + svc->flags, + svc->vcpu->processor); + + printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight); + + printk("\n"); +} + +static void +csched_dump_pcpu(int cpu) +{ + struct list_head *runq, *iter; + //struct csched_pcpu *spc; + struct csched_vcpu *svc; + int loop; + char cpustr[100]; + + //spc = CSCHED_PCPU(cpu); + runq = RUNQ(cpu); + + cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_map,cpu)); + printk(" sibling=%s, ", cpustr); + cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_map,cpu)); + printk("core=%s\n", cpustr); + + /* current VCPU */ + svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr); + if ( svc ) + { + printk("\trun: "); + csched_dump_vcpu(svc); + } + + loop = 0; + list_for_each( iter, runq ) + { + svc = __runq_elem(iter); + if ( svc ) + { + printk("\t%3d: ", ++loop); + csched_dump_vcpu(svc); + } + } +} + +static void +csched_dump(void) +{ + struct list_head *iter_sdom, *iter_svc; + int loop; + + printk("info:\n" + "\tncpus = %u\n" + "\tdefault-weight = %d\n", + csched_priv.ncpus, + CSCHED_DEFAULT_WEIGHT); + + printk("active vcpus:\n"); + loop = 0; + list_for_each( iter_sdom, &csched_priv.sdom ) + { + struct csched_dom *sdom; + sdom = list_entry(iter_sdom, struct csched_dom, sdom_elem); + + list_for_each( iter_svc, &sdom->vcpu ) + { + struct csched_vcpu *svc; + svc = list_entry(iter_svc, struct csched_vcpu, sdom_elem); + + printk("\t%3d: ", ++loop); + csched_dump_vcpu(svc); + } + } +} + +static void +csched_init(void) +{ + spin_lock_init(&csched_priv.lock); + INIT_LIST_HEAD(&csched_priv.sdom); + INIT_LIST_HEAD(&csched_priv.svc); + + csched_priv.ncpus = 0; + + /* FIXME: Runqueue per l2 */ + csched_priv.max_weight = 1; + INIT_LIST_HEAD(&csched_priv.runq); +} + +struct scheduler sched_credit2_def = { + .name = "SMP Credit Scheduler rev2", + .opt_name = "credit2", + .sched_id = XEN_SCHEDULER_CREDIT2, + + .init_domain = csched_dom_init, + .destroy_domain = csched_dom_destroy, + + .init_vcpu = csched_vcpu_init, + .destroy_vcpu = csched_vcpu_destroy, + + .sleep = csched_vcpu_sleep, + .wake = csched_vcpu_wake, + + .adjust = csched_dom_cntl, + + .pick_cpu = csched_cpu_pick, + .do_schedule = csched_schedule, + .context_saved = csched_context_saved, + + .dump_cpu_state = csched_dump_pcpu, + .dump_settings = csched_dump, + .init = csched_init, +}; diff -r 7bd1dd9fb30f xen/common/schedule.c --- a/xen/common/schedule.c Wed Jan 13 14:15:51 2010 +0000 +++ b/xen/common/schedule.c Wed Jan 13 14:36:58 2010 +0000 @@ -58,9 +58,11 @@ extern const struct scheduler sched_sedf_def; extern const struct scheduler sched_credit_def; +extern const struct scheduler sched_credit2_def; static const struct scheduler *__initdata schedulers[] = { &sched_sedf_def, &sched_credit_def, + &sched_credit2_def, NULL }; diff -r 7bd1dd9fb30f xen/include/public/domctl.h --- a/xen/include/public/domctl.h Wed Jan 13 14:15:51 2010 +0000 +++ b/xen/include/public/domctl.h Wed Jan 13 14:36:58 2010 +0000 @@ -295,6 +295,7 @@ /* Scheduler types. */ #define XEN_SCHEDULER_SEDF 4 #define XEN_SCHEDULER_CREDIT 5 +#define XEN_SCHEDULER_CREDIT2 6 /* Set or get info? 
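Usage note: because sched_credit2_def is registered in the schedulers[] array in schedule.c above, the new scheduler is selected at boot through the existing sched= Xen command line option, matched against its opt_name; the default remains the existing credit scheduler. A hypothetical GRUB stanza, with paths illustrative only:

    kernel /boot/xen.gz sched=credit2
    module /boot/vmlinuz-2.6.18-xen root=/dev/sda1 ro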
 */
 #define XEN_DOMCTL_SCHEDOP_putinfo 0
 #define XEN_DOMCTL_SCHEDOP_getinfo 1
@@ -313,6 +314,9 @@
             uint16_t weight;
             uint16_t cap;
         } credit;
+        struct xen_domctl_sched_credit2 {
+            uint16_t weight;
+        } credit2;
     } u;
 };
 typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
diff -r 7bd1dd9fb30f xen/include/public/trace.h
--- a/xen/include/public/trace.h Wed Jan 13 14:15:51 2010 +0000
+++ b/xen/include/public/trace.h Wed Jan 13 14:36:58 2010 +0000
@@ -53,6 +53,7 @@
 #define TRC_HVM_HANDLER 0x00082000 /* various HVM handlers */
 
 #define TRC_SCHED_MIN 0x00021000 /* Just runstate changes */
+#define TRC_SCHED_CLASS 0x00022000 /* Scheduler-specific */
 #define TRC_SCHED_VERBOSE 0x00028000 /* More inclusive scheduling */
 
 /* Trace events per class */
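A hypothetical toolstack-side sketch (not part of the patch) of how the new credit2 member of xen_domctl_scheduler_op could be used to hand a weight to csched_dom_cntl(). Only the structure filling is shown; issuing the hypercall through libxc (and the exact wrapper name) is left out, and the includes assume the installed Xen public headers.

    #define __XEN_TOOLS__ 1   /* domctl is a tools-only interface */
    #include <stdint.h>
    #include <string.h>
    #include <xen/xen.h>      /* domid_t and basic types */
    #include <xen/domctl.h>   /* public domctl interface, as patched above */

    /* Fill a domctl request that sets a domain's credit2 weight. */
    void set_credit2_weight(struct xen_domctl *domctl,
                            domid_t domid, uint16_t weight)
    {
        memset(domctl, 0, sizeof(*domctl));
        domctl->cmd = XEN_DOMCTL_scheduler_op;
        domctl->interface_version = XEN_DOMCTL_INTERFACE_VERSION;
        domctl->domain = domid;
        domctl->u.scheduler_op.sched_id = XEN_SCHEDULER_CREDIT2;
        domctl->u.scheduler_op.cmd = XEN_DOMCTL_SCHEDOP_putinfo;
        domctl->u.scheduler_op.u.credit2.weight = weight;
        /* ...then pass the structure to the toolstack's domctl hypercall
         * wrapper (e.g. do_domctl() in libxc). */
    }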