diff -r 23d34c3ba4b7 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c	Mon Nov 30 16:13:01 2009 -0600
+++ b/xen/arch/x86/domain.c	Mon Dec 07 16:59:53 2009 +0000
@@ -1426,9 +1426,9 @@
 
     set_current(next);
 
-    if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
+    if ( (per_cpu(curr_vcpu, cpu) == next) /* || is_idle_vcpu(next) */)
     {
-        local_irq_enable();
+        ;//local_irq_enable();
     }
     else
     {
@@ -1445,9 +1445,8 @@
             write_efer(efer | EFER_SCE);
         }
 #endif
 
-        /* Re-enable interrupts before restoring state which may fault. */
-        local_irq_enable();
+        //local_irq_enable();
 
         if ( !is_hvm_vcpu(next) )
         {
@@ -1458,6 +1457,13 @@
 
     context_saved(prev);
 
+    local_irq_enable();
+
+    /* If we've deadlocked somehow and temporarily made a VM unrunnable,
+     * clear the bit and call wake. */
+    if ( test_and_clear_bit(_VPF_deadlock, &prev->pause_flags ) )
+        vcpu_wake(prev);
+
     if (prev != next)
         update_runstate_area(next);
 
diff -r 23d34c3ba4b7 xen/arch/x86/nmi.c
--- a/xen/arch/x86/nmi.c	Mon Nov 30 16:13:01 2009 -0600
+++ b/xen/arch/x86/nmi.c	Mon Dec 07 16:59:53 2009 +0000
@@ -391,7 +391,6 @@
     u32 id = cpu_physical_id(cpu);
 
     printk("Triggering NMI on APIC ID %x\n", id);
-    debugtrace_dump();
 
     local_irq_disable();
     apic_wait_icr_idle();
@@ -426,11 +425,12 @@
         if ( this_cpu(alert_counter) == 5*nmi_hz )
         {
             console_force_unlock();
+            spin_lock(&panic_lock);
             printk("Watchdog timer detects that CPU%d is stuck!\n",
                    smp_processor_id());
-            spin_lock(&panic_lock);
             show_execution_state(regs);
             debugtrace_dump();
+            spin_unlock(&panic_lock);
             atomic_inc(&all_panic);
             {
                 int cpu;
@@ -441,7 +441,6 @@
                     do_nmi_trigger_cpu(cpu);
             }
         }
-        spin_unlock(&panic_lock);
         while(1); //fatal_trap(TRAP_nmi, regs);
     }
diff -r 23d34c3ba4b7 xen/common/Makefile
--- a/xen/common/Makefile	Mon Nov 30 16:13:01 2009 -0600
+++ b/xen/common/Makefile	Mon Dec 07 16:59:53 2009 +0000
@@ -13,6 +13,7 @@
 obj-y += page_alloc.o
 obj-y += rangeset.o
 obj-y += sched_credit.o
+obj-y += sched_credit2.o
 obj-y += sched_sedf.o
 obj-y += schedule.o
 obj-y += shutdown.o
diff -r 23d34c3ba4b7 xen/common/sched_credit2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/common/sched_credit2.c	Mon Dec 07 16:59:53 2009 +0000
@@ -0,0 +1,992 @@
+
+/****************************************************************************
+ * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd
+ ****************************************************************************
+ *
+ *        File: common/sched_credit2.c
+ *      Author: George Dunlap
+ *
+ * Description: Credit-based SMP CPU scheduler
+ *              Based on an earlier version by Emmanuel Ackaouy.
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <asm/atomic.h>
+#include <xen/errno.h>
+#include <xen/trace.h>
+
+#if __i386__
+#define PRI_stime "lld"
+#else
+#define PRI_stime "ld"
+#endif
+
+#define d2printk(x...)
+//#define d2printk printk
+
+#define TRC_CSCHED2_TICK         TRC_SCHED_CLASS + 1
+#define TRC_CSCHED2_RUNQ_POS     TRC_SCHED_CLASS + 2
+#define TRC_CSCHED2_CREDIT_BURN  TRC_SCHED_CLASS + 3
+#define TRC_CSCHED2_CREDIT_ADD   TRC_SCHED_CLASS + 4
+#define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS + 5
+
+/*
+ * Design:
+ *
+ * VMs "burn" credits based on their weight; higher weight means credits burn
+ * more slowly.
+ *
+ * vcpus are inserted into the runqueue in credit order.
+ *
+ * Credits are "reset" when the credit of the next vcpu in the runqueue is
+ * less than or equal to zero.  At that point, everyone's credits are
+ * "clipped" to a small value, and a fixed credit is added to everyone.
+ *
+ * The plan is for all cores that share an L2 to share the same runqueue.
+ * At the moment, there is one global runqueue for all cores.
+ */
+
+/*
+ * Basic constants
+ */
+#define CSCHED_DEFAULT_WEIGHT 256
+#define CSCHED_MIN_TIMER      MICROSECS(500)
+#define CSCHED_CARRYOVER_MAX  CSCHED_MIN_TIMER
+#define CSCHED_CREDIT_RESET   0
+#define CSCHED_CREDIT_INIT    MILLISECS(10)
+#define CSCHED_MAX_TIMER      MILLISECS(2)
+
+#define CSCHED_IDLE_CREDIT    (-(1<<30))
+
+/*
+ * Flags
+ */
+// Placeholder template for when we need real flags
+//#define __CSFLAG_foo 1
+//#define CSFLAG_foo (1<<__CSFLAG_foo)
+
+
+/*
+ * Useful macros
+ */
+#define CSCHED_PCPU(_c) \
+    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
+#define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv)
+#define CSCHED_DOM(_dom)   ((struct csched_dom *) (_dom)->sched_priv)
+//#define RUNQ(_cpu) (&(CSCHED_GROUP(_cpu)->runq))
+#define RUNQ(_cpu) (&csched_priv.runq)
+
+/*
+ * System-wide private data
+ */
+struct csched_private {
+    spinlock_t lock;
+    struct list_head sdom;
+    struct list_head svc;   /* List of all vcpus */
+    uint32_t ncpus;
+
+    /* Per-runqueue info */
+    struct list_head runq;  /* Global runqueue */
+    int max_weight;
+};
+
+struct csched_pcpu {
+    int _dummy;
+};
+
+/*
+ * Virtual CPU
+ */
+struct csched_vcpu {
+    struct list_head global_elem; /* On the global vcpu list */
+    struct list_head sdom_elem;   /* On the domain vcpu list */
+    struct list_head runq_elem;   /* On the runqueue */
+
+    /* Up-pointers */
+    struct csched_dom *sdom;
+    struct vcpu *vcpu;
+
+    int weight;
+
+    int credit;
+    s_time_t start_time; /* When we were scheduled (used for credit) */
+    unsigned flags;      /* 16 bits doesn't seem to play well with clear_bit() */
+
+};
+
+/*
+ * Domain
+ */
+struct csched_dom {
+    struct list_head vcpu;
+    struct list_head sdom_elem;
+    struct domain *dom;
+    uint16_t weight;
+    uint16_t nr_vcpus;
+};
+
+
+/*
+ * Global variables
+ */
+static struct csched_private csched_priv;
+
+/*
+ * Time-to-credit, credit-to-time.
+ * FIXME: Do pre-calculated division?
+ */ +static s_time_t t2c(s_time_t time, struct csched_vcpu *svc) +{ + return time * csched_priv.max_weight / svc->weight; +} + +static s_time_t c2t(s_time_t credit, struct csched_vcpu *svc) +{ + return credit * svc->weight / csched_priv.max_weight; +} + +/* + * Runqueue related code + */ + +static /*inline*/ int +__vcpu_on_runq(struct csched_vcpu *svc) +{ + return !list_empty(&svc->runq_elem); +} + +static /*inline*/ struct csched_vcpu * +__runq_elem(struct list_head *elem) +{ + return list_entry(elem, struct csched_vcpu, runq_elem); +} + +static int +__runq_insert(struct list_head *runq, struct csched_vcpu *svc) +{ + struct list_head *iter; + int pos = 0; + + d2printk("rqi d%dv%d\n", + svc->vcpu->domain->domain_id, + svc->vcpu->vcpu_id); + + list_for_each( iter, runq ) + { + struct csched_vcpu * iter_svc = __runq_elem(iter); + + if ( svc->credit > iter_svc->credit ) + { + d2printk(" p%d d%dv%d\n", + pos, + iter_svc->vcpu->domain->domain_id, + iter_svc->vcpu->vcpu_id); + break; + } + pos++; + } + + list_add_tail(&svc->runq_elem, iter); + + return pos; +} + +static void +runq_insert(unsigned int cpu, struct csched_vcpu *svc) +{ + struct list_head * runq = RUNQ(cpu); + int pos = 0; + + /* FIXME: Runqueue per L2 */ + ASSERT( spin_is_locked(&csched_priv.lock) ); + + BUG_ON( __vcpu_on_runq(svc) ); + /* FIXME: Check runqueue handles this cpu*/ + //BUG_ON( cpu != svc->vcpu->processor ); + + pos = __runq_insert(runq, svc); + + { + struct { + unsigned dom:16,vcpu:16; + unsigned pos; + } d; + d.dom = svc->vcpu->domain->domain_id; + d.vcpu = svc->vcpu->vcpu_id; + d.pos = pos; + trace_var(TRC_CSCHED2_RUNQ_POS, 1, + sizeof(d), + (unsigned char *)&d); + } + + return; +} + +static inline void +__runq_remove(struct csched_vcpu *svc) +{ + BUG_ON( !__vcpu_on_runq(svc) ); + list_del_init(&svc->runq_elem); +} + +void burn_credits(struct csched_vcpu *, s_time_t); + +/* Check to see if the item on the runqueue is higher priority than what's + * currently running; if so, wake up the processor */ +static /*inline*/ void +runq_tickle(unsigned int cpu, struct csched_vcpu *new, s_time_t now) +{ + int i, ipid=-1; + s_time_t lowest=(1<<30); + + d2printk("rqt d%dv%d cd%dv%d\n", + new->vcpu->domain->domain_id, + new->vcpu->vcpu_id, + current->domain->domain_id, + current->vcpu_id); + + /* Find the cpu in this queue group that has the lowest credits */ + /* FIXME: separate runqueues */ + for_each_online_cpu ( i ) + { + struct csched_vcpu * const cur = + CSCHED_VCPU(per_cpu(schedule_data, i).curr); + + /* FIXME: keep track of idlers, chose from the mask */ + if ( is_idle_vcpu(cur->vcpu) ) + { + ipid = i; + lowest = CSCHED_IDLE_CREDIT; + break; + } + else + { + /* Update credits for current to see if we want to preempt */ + burn_credits(cur, now); + + if ( cur->credit < lowest ) + { + ipid = i; + lowest = cur->credit; + } + + /* TRACE */ { + struct { + unsigned dom:16,vcpu:16; + unsigned credit; + } d; + d.dom = cur->vcpu->domain->domain_id; + d.vcpu = cur->vcpu->vcpu_id; + d.credit = cur->credit; + trace_var(TRC_CSCHED2_TICKLE_CHECK, 1, + sizeof(d), + (unsigned char *)&d); + } + } + } + + if ( ipid != -1 ) + { + int cdiff = lowest - new->credit; + + if ( lowest == CSCHED_IDLE_CREDIT || cdiff < 0 ) { + d2printk("si %d\n", ipid); + cpu_raise_softirq(ipid, SCHEDULE_SOFTIRQ); + } + else + /* FIXME: Wake up later? 
+
+/*
+ * Credit-related code
+ */
+static void reset_credit(int cpu, s_time_t now)
+{
+    struct list_head *iter;
+
+    list_for_each( iter, &csched_priv.svc )
+    {
+        struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, global_elem);
+        s_time_t cmax;
+
+        BUG_ON( is_idle_vcpu(svc->vcpu) );
+
+        /* Maximum amount of credit that can be carried over */
+        cmax = CSCHED_CARRYOVER_MAX;
+
+        if ( svc->credit > cmax )
+            svc->credit = cmax;
+        svc->credit += CSCHED_CREDIT_INIT; /* Find a better name */
+        svc->start_time = now;
+
+        /* Trace credit */
+    }
+
+    /* No need to resort runqueue, as everyone's order should be the same. */
+}
+
+void burn_credits(struct csched_vcpu *svc, s_time_t now)
+{
+    s_time_t delta;
+
+    /* Assert svc is current */
+    ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr));
+
+    if ( is_idle_vcpu(svc->vcpu) )
+    {
+        BUG_ON(svc->credit != CSCHED_IDLE_CREDIT);
+        return;
+    }
+
+    delta = now - svc->start_time;
+
+    if ( delta > 0 ) {
+        /* This will round down; should we consider rounding up...? */
+        svc->credit -= t2c(delta, svc);
+        svc->start_time = now;
+
+        d2printk("b d%dv%d c%d\n",
+                 svc->vcpu->domain->domain_id,
+                 svc->vcpu->vcpu_id,
+                 svc->credit);
+    } else {
+        d2printk("%s: Time went backwards? now %"PRI_stime" start %"PRI_stime"\n",
+                 __func__, now, svc->start_time);
+    }
+
+    /* TRACE */
+    {
+        struct {
+            unsigned dom:16,vcpu:16;
+            unsigned credit;
+            int delta;
+        } d;
+        d.dom = svc->vcpu->domain->domain_id;
+        d.vcpu = svc->vcpu->vcpu_id;
+        d.credit = svc->credit;
+        d.delta = delta;
+        trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
+                  sizeof(d),
+                  (unsigned char *)&d);
+    }
+}
+
+/* Find the domain with the highest weight. */
+void update_max_weight(int new_weight, int old_weight)
+{
+    if ( new_weight > csched_priv.max_weight )
+    {
+        csched_priv.max_weight = new_weight;
+        printk("%s: Max weight %d\n", __func__, csched_priv.max_weight);
+    }
+    else if ( old_weight == csched_priv.max_weight )
+    {
+        struct list_head *iter;
+        int max_weight = 1;
+
+        list_for_each( iter, &csched_priv.sdom )
+        {
+            struct csched_dom * sdom = list_entry(iter, struct csched_dom, sdom_elem);
+
+            if ( sdom->weight > max_weight )
+                max_weight = sdom->weight;
+        }
+
+        csched_priv.max_weight = max_weight;
+        printk("%s: Max weight %d\n", __func__, csched_priv.max_weight);
+    }
+}
+
+/*
+ * Initialization code
+ */
+static int
+csched_pcpu_init(int cpu)
+{
+    unsigned long flags;
+    struct csched_pcpu *spc;
+
+    /* Allocate per-PCPU info */
+    spc = xmalloc(struct csched_pcpu);
+    if ( spc == NULL )
+        return -1;
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    /* Initialize/update system-wide config */
+    per_cpu(schedule_data, cpu).sched_priv = spc;
+
+    csched_priv.ncpus++;
+
+    /* Start off idling... */
+    BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    return 0;
+}
+
+#ifndef NDEBUG
+static /*inline*/ void
+__csched_vcpu_check(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_dom * const sdom = svc->sdom;
+
+    BUG_ON( svc->vcpu != vc );
+    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
+    if ( sdom )
+    {
+        BUG_ON( is_idle_vcpu(vc) );
+        BUG_ON( sdom->dom != vc->domain );
+    }
+    else
+    {
+        BUG_ON( !is_idle_vcpu(vc) );
+    }
+}
+#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
+#else
+#define CSCHED_VCPU_CHECK(_vc)
+#endif
+
+static int
+csched_vcpu_init(struct vcpu *vc)
+{
+    struct domain * const dom = vc->domain;
+    struct csched_dom *sdom = CSCHED_DOM(dom);
+    struct csched_vcpu *svc;
+
+    printk("%s: Initializing d%dv%d\n",
+           __func__, dom->domain_id, vc->vcpu_id);
+
+    /* Allocate per-VCPU info */
+    svc = xmalloc(struct csched_vcpu);
+    if ( svc == NULL )
+        return -1;
+
+    INIT_LIST_HEAD(&svc->global_elem);
+    INIT_LIST_HEAD(&svc->sdom_elem);
+    INIT_LIST_HEAD(&svc->runq_elem);
+
+    svc->sdom = sdom;
+    svc->vcpu = vc;
+    svc->flags = 0U;
+    vc->sched_priv = svc;
+
+    if ( ! is_idle_vcpu(vc) )
+    {
+        BUG_ON( sdom == NULL );
+
+        svc->credit = CSCHED_CREDIT_INIT;
+        svc->weight = sdom->weight;
+
+        list_add_tail(&svc->sdom_elem, &sdom->vcpu);
+        list_add_tail(&svc->global_elem, &csched_priv.svc);
+        sdom->nr_vcpus++;
+    }
+    else
+    {
+        BUG_ON( sdom != NULL );
+        svc->credit = CSCHED_IDLE_CREDIT;
+        svc->weight = 0;
+    }
+
+    /* Allocate per-PCPU info */
+    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
+    {
+        if ( csched_pcpu_init(vc->processor) != 0 )
+            return -1;
+    }
+
+    CSCHED_VCPU_CHECK(vc);
+    return 0;
+}
+
+static void
+csched_vcpu_destroy(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_dom * const sdom = svc->sdom;
+    unsigned long flags;
+
+    BUG_ON( sdom == NULL );
+    BUG_ON( !list_empty(&svc->runq_elem) );
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    /* Remove from sdom list */
+    list_del_init(&svc->global_elem);
+    list_del_init(&svc->sdom_elem);
+
+    sdom->nr_vcpus--;
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    xfree(svc);
+}
+
+static void
+csched_vcpu_sleep(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( per_cpu(schedule_data, vc->processor).curr == vc )
+        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
+    else if ( __vcpu_on_runq(svc) )
+        __runq_remove(svc);
+}
+
+static void
+csched_vcpu_wake(struct vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    const unsigned int cpu = vc->processor;
+    s_time_t now = 0;
+    unsigned long flags;
+
+    d2printk("w d%dv%d\n", vc->domain->domain_id, vc->vcpu_id);
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    /* FIXME: Runqueue per L2 */
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    /* Make sure svc priority mod happens before runq check */
+    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
+    {
+        goto out;
+    }
+    if ( unlikely(__vcpu_on_runq(svc)) )
+    {
+        /* If we've boosted someone that's already on a runqueue, prioritize
+         * it and inform the cpu in question. */
+        goto out;
+    }
+
+    now = NOW();
+
+    /* Put the VCPU on the runq */
+    runq_insert(cpu, svc);
+    runq_tickle(cpu, svc, now);
+
+out:
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+    d2printk("w-\n");
+    return;
+}
+
+static int
+csched_cpu_pick(struct vcpu *vc)
+{
+    /* FIXME: Choose a schedule group based on load */
+    return 0;
+}
+
+static int
+csched_dom_cntl(
+    struct domain *d,
+    struct xen_domctl_scheduler_op *op)
+{
+    struct csched_dom * const sdom = CSCHED_DOM(d);
+    unsigned long flags;
+
+    if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
+    {
+        op->u.credit2.weight = sdom->weight;
+    }
+    else
+    {
+        ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
+
+        if ( op->u.credit2.weight != 0 )
+        {
+            struct list_head *iter;
+            int old_weight;
+
+            spin_lock_irqsave(&csched_priv.lock, flags);
+
+            old_weight = sdom->weight;
+
+            sdom->weight = op->u.credit2.weight;
+
+            /* Update max weight */
+            update_max_weight(sdom->weight, old_weight);
+
+            /* Update weights for vcpus */
+            list_for_each ( iter, &sdom->vcpu )
+            {
+                struct csched_vcpu *svc = list_entry(iter, struct csched_vcpu, sdom_elem);
+
+                svc->weight = sdom->weight;
+            }
+
+            spin_unlock_irqrestore(&csched_priv.lock, flags);
+        }
+    }
+
+    return 0;
+}
+
+static int
+csched_dom_init(struct domain *dom)
+{
+    struct csched_dom *sdom;
+    unsigned long flags;
+
+    printk("%s: Initializing domain %d\n", __func__, dom->domain_id);
+
+    if ( is_idle_domain(dom) )
+        return 0;
+
+    sdom = xmalloc(struct csched_dom);
+    if ( sdom == NULL )
+        return -ENOMEM;
+
+    /* Initialize credit and weight */
+    INIT_LIST_HEAD(&sdom->vcpu);
+    INIT_LIST_HEAD(&sdom->sdom_elem);
+    sdom->dom = dom;
+    sdom->weight = CSCHED_DEFAULT_WEIGHT;
+    sdom->nr_vcpus = 0;
+
+    dom->sched_priv = sdom;
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    update_max_weight(sdom->weight, 0);
+
+    list_add_tail(&sdom->sdom_elem, &csched_priv.sdom);
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    return 0;
+}
+
+static void
+csched_dom_destroy(struct domain *dom)
+{
+    struct csched_dom *sdom = CSCHED_DOM(dom);
+    unsigned long flags;
+
+    BUG_ON(!list_empty(&sdom->vcpu));
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    list_del_init(&sdom->sdom_elem);
+
+    update_max_weight(0, sdom->weight);
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    xfree(CSCHED_DOM(dom));
+}
+
+#if 0
+static void csched_load_balance(int cpu)
+{
+    /* FIXME: Do something. */
+}
+#endif
+
+/* How long should we let this vcpu run for? */
+static s_time_t
+csched_runtime(int cpu, struct csched_vcpu *snext)
+{
+    s_time_t time = CSCHED_MAX_TIMER;
+    struct list_head *runq = RUNQ(cpu);
+
+    if ( is_idle_vcpu(snext->vcpu) )
+        return CSCHED_MAX_TIMER;
+
+    /* Basic time */
+    time = c2t(snext->credit, snext);
+
+    /* Next guy on runqueue */
+    if ( ! list_empty(runq) )
+    {
+        struct csched_vcpu *svc = __runq_elem(runq->next);
+        s_time_t ntime;
+
+        if ( ! is_idle_vcpu(svc->vcpu) )
+        {
+            ntime = c2t(snext->credit - svc->credit, snext);
+
+            if ( time > ntime )
+                time = ntime;
+        }
+    }
+
+    /* Check limits */
+    if ( time < CSCHED_MIN_TIMER )
+        time = CSCHED_MIN_TIMER;
+    else if ( time > CSCHED_MAX_TIMER )
+        time = CSCHED_MAX_TIMER;
+
+    return time;
+}
+
+void __dump_execstate(void *unused);
+
+/*
+ * This function is in the critical path. It is designed to be simple and
+ * fast for the common case.
+ */ +static struct task_slice +csched_schedule(s_time_t now) +{ + const int cpu = smp_processor_id(); + struct list_head * const runq = RUNQ(cpu); + //struct csched_pcpu *spc = CSCHED_PCPU(cpu); + struct csched_vcpu * const scurr = CSCHED_VCPU(current); + struct csched_vcpu *snext; + struct task_slice ret; + int flags; + + CSCHED_VCPU_CHECK(current); + + d2printk("sc p%d c d%dv%d now %"PRI_stime"\n", + cpu, + scurr->vcpu->domain->domain_id, + scurr->vcpu->vcpu_id, + now); + + + /* FIXME: Runqueue per L2 */ + spin_lock_irqsave(&csched_priv.lock, flags); + + /* Update credits */ + burn_credits(scurr, now); + + /* + * Select next runnable local VCPU (ie top of local runq) + * Insert will cause credits to be updated. + */ + if ( vcpu_runnable(current) ) + runq_insert(cpu, scurr); + else + BUG_ON( is_idle_vcpu(current) || list_empty(runq) ); + + snext = __runq_elem(runq->next); + + if ( snext->credit <= CSCHED_CREDIT_RESET && !is_idle_vcpu(snext->vcpu) ) + { + /* If the next item has <= 0 credits, update credits and resort */ + reset_credit(cpu, now); + } + + __runq_remove(snext); + + /* HACK. Multiple cpus are sharing a runqueue; but due to the way + * things are set up, it's possible for a vcpu to be scheduled out on one + * cpu and put on the runqueue, and taken off by another cpu, before the first + * cpu has actually completed the context switch (indicated by is_running). + * + * So in general we just wait for is_running to be false, always checking + * to see if it should still be put on the runqueue (i.e., it may be + * paused). + * + * Even so, occasionally we get into a deadlock situation. I haven't found + * out who the other "hold-and-wait"-er is because they seem to have + * irqs disabled. In any case, if we spin for 65K times, we assume there's + * a deadlock and put the vcpu on the tail of the runqueue (yes, behind the + * idle vcpus). It will be re-ordered at most 10ms later when we do a + * runqueue sort. + * + * Other hold-and-waiters: + * + flush_tlb_mask(), which will try to get a sync_lazy_execstate. + * + vcpu_wake(): if an interrupt that causes a wake happens between unlock in schedule + * and irq_disable() in context_switch(), it tries to grab the vcpu's cpu's schedule lock + * (which we're holding). + **/ + if ( snext != scurr && snext->vcpu->is_running ) + { + int count = 0; + do { + BUG_ON(count < 0); + count++; + + if ( (count & 0xffff) == 0 ) { + printk("p%d d%dv%d running on p%d, passed %d iterations!\n", + cpu, snext->vcpu->domain->domain_id, + snext->vcpu->vcpu_id, + snext->vcpu->processor, + count); + set_bit(_VPF_deadlock, &snext->vcpu->pause_flags); + BUG_ON( vcpu_runnable(snext->vcpu) ); + + } else if ( vcpu_runnable(snext->vcpu) ) + runq_insert(cpu, snext); + + BUG_ON(list_empty(runq)); + + snext = __runq_elem(runq->next); + __runq_remove(snext); + } while ( snext != scurr && snext->vcpu->is_running ); + //printk("done\n"); + } + + /* FIXME: Think about this some more. */ + snext->vcpu->processor = cpu; + + spin_unlock_irqrestore(&csched_priv.lock, flags); + +#if 0 + /* + * Update idlers mask if necessary. When we're idling, other CPUs + * will tickle us when they get extra work. + */ + if ( is_idle_vcpu(snext->vcpu) ) + { + if ( !cpu_isset(cpu, csched_priv.idlers) ) + cpu_set(cpu, csched_priv.idlers); + } + else if ( cpu_isset(cpu, csched_priv.idlers) ) + { + cpu_clear(cpu, csched_priv.idlers); + } +#endif + + if ( !is_idle_vcpu(snext->vcpu) ) + snext->start_time = now; + /* + * Return task to run next... 
+ */ + ret.time = csched_runtime(cpu, snext); + ret.task = snext->vcpu; + + CSCHED_VCPU_CHECK(ret.task); + return ret; +} + +static void +csched_dump_vcpu(struct csched_vcpu *svc) +{ + printk("[%i.%i] flags=%x cpu=%i", + svc->vcpu->domain->domain_id, + svc->vcpu->vcpu_id, + svc->flags, + svc->vcpu->processor); + + printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight); + + printk("\n"); +} + +static void +csched_dump_pcpu(int cpu) +{ + struct list_head *runq, *iter; + //struct csched_pcpu *spc; + struct csched_vcpu *svc; + int loop; + char cpustr[100]; + + //spc = CSCHED_PCPU(cpu); + runq = RUNQ(cpu); + + cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_map,cpu)); + printk(" sibling=%s, ", cpustr); + cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_map,cpu)); + printk("core=%s\n", cpustr); + + /* current VCPU */ + svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr); + if ( svc ) + { + printk("\trun: "); + csched_dump_vcpu(svc); + } + + loop = 0; + list_for_each( iter, runq ) + { + svc = __runq_elem(iter); + if ( svc ) + { + printk("\t%3d: ", ++loop); + csched_dump_vcpu(svc); + } + } +} + +static void +csched_dump(void) +{ + struct list_head *iter_sdom, *iter_svc; + int loop; + + printk("info:\n" + "\tncpus = %u\n" + "\tdefault-weight = %d\n", + csched_priv.ncpus, + CSCHED_DEFAULT_WEIGHT); + + printk("active vcpus:\n"); + loop = 0; + list_for_each( iter_sdom, &csched_priv.sdom ) + { + struct csched_dom *sdom; + sdom = list_entry(iter_sdom, struct csched_dom, sdom_elem); + + list_for_each( iter_svc, &sdom->vcpu ) + { + struct csched_vcpu *svc; + svc = list_entry(iter_svc, struct csched_vcpu, sdom_elem); + + printk("\t%3d: ", ++loop); + csched_dump_vcpu(svc); + } + } +} + +static void +csched_init(void) +{ + spin_lock_init(&csched_priv.lock); + INIT_LIST_HEAD(&csched_priv.sdom); + INIT_LIST_HEAD(&csched_priv.svc); + + csched_priv.ncpus = 0; + + /* FIXME: Runqueue per l2 */ + csched_priv.max_weight = 1; + INIT_LIST_HEAD(&csched_priv.runq); +} + +struct scheduler sched_credit2_def = { + .name = "SMP Credit Scheduler rev2", + .opt_name = "credit2", + .sched_id = XEN_SCHEDULER_CREDIT2, + + .init_domain = csched_dom_init, + .destroy_domain = csched_dom_destroy, + + .init_vcpu = csched_vcpu_init, + .destroy_vcpu = csched_vcpu_destroy, + + .sleep = csched_vcpu_sleep, + .wake = csched_vcpu_wake, + + .adjust = csched_dom_cntl, + + .pick_cpu = csched_cpu_pick, + .do_schedule = csched_schedule, + + .dump_cpu_state = csched_dump_pcpu, + .dump_settings = csched_dump, + .init = csched_init, +}; diff -r 23d34c3ba4b7 xen/common/schedule.c --- a/xen/common/schedule.c Mon Nov 30 16:13:01 2009 -0600 +++ b/xen/common/schedule.c Mon Dec 07 16:59:53 2009 +0000 @@ -58,9 +58,11 @@ extern const struct scheduler sched_sedf_def; extern const struct scheduler sched_credit_def; +extern const struct scheduler sched_credit2_def; static const struct scheduler *__initdata schedulers[] = { &sched_sedf_def, &sched_credit_def, + &sched_credit2_def, NULL }; diff -r 23d34c3ba4b7 xen/include/public/domctl.h --- a/xen/include/public/domctl.h Mon Nov 30 16:13:01 2009 -0600 +++ b/xen/include/public/domctl.h Mon Dec 07 16:59:53 2009 +0000 @@ -297,6 +297,7 @@ /* Scheduler types. */ #define XEN_SCHEDULER_SEDF 4 #define XEN_SCHEDULER_CREDIT 5 +#define XEN_SCHEDULER_CREDIT2 6 /* Set or get info? 
 #define XEN_DOMCTL_SCHEDOP_putinfo 0
 #define XEN_DOMCTL_SCHEDOP_getinfo 1
@@ -315,6 +316,9 @@
             uint16_t weight;
             uint16_t cap;
         } credit;
+        struct xen_domctl_sched_credit2 {
+            uint16_t weight;
+        } credit2;
     } u;
 };
 typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
diff -r 23d34c3ba4b7 xen/include/public/trace.h
--- a/xen/include/public/trace.h	Mon Nov 30 16:13:01 2009 -0600
+++ b/xen/include/public/trace.h	Mon Dec 07 16:59:53 2009 +0000
@@ -53,6 +53,7 @@
 #define TRC_HVM_HANDLER     0x00082000   /* various HVM handlers */
 
 #define TRC_SCHED_MIN       0x00021000   /* Just runstate changes */
+#define TRC_SCHED_CLASS     0x00022000   /* Scheduler-specific    */
 #define TRC_SCHED_VERBOSE   0x00028000   /* More inclusive scheduling */
 
 /* Trace events per class */
diff -r 23d34c3ba4b7 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Mon Nov 30 16:13:01 2009 -0600
+++ b/xen/include/xen/sched.h	Mon Dec 07 16:59:53 2009 +0000
@@ -530,6 +530,8 @@
 /* VCPU affinity has changed: migrating to a new CPU. */
 #define _VPF_migrating       3
 #define VPF_migrating        (1UL<<_VPF_migrating)
+#define _VPF_deadlock        4
+#define VPF_deadlock         (1UL<<_VPF_deadlock)
 
 static inline int vcpu_runnable(struct vcpu *v)
 {
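
For anyone who wants to poke at the credit arithmetic outside the hypervisor, the weight scaling done by t2c()/c2t() can be reproduced in a few lines of user-space C. This is an illustrative sketch only: the harness (main(), the sample weights, the fixed max_weight of 256) is not part of the patch; only the two conversion functions mirror what sched_credit2.c does.

/* Standalone sketch of the credit2 time<->credit scaling (illustrative only). */
#include <stdio.h>
#include <stdint.h>

typedef int64_t s_time_t;          /* stand-in for Xen's signed nanosecond type */

static s_time_t max_weight = 256;  /* assume one domain at CSCHED_DEFAULT_WEIGHT */

/* Heavier vcpus burn credit more slowly: time is scaled by max_weight/weight. */
static s_time_t t2c(s_time_t time, int weight)
{
    return time * max_weight / weight;
}

/* ...and get proportionally more wall-clock time out of each credit. */
static s_time_t c2t(s_time_t credit, int weight)
{
    return credit * weight / max_weight;
}

int main(void)
{
    s_time_t slice = 10000000; /* 10ms in ns, cf. CSCHED_CREDIT_INIT */

    /* A weight-512 vcpu burns half the credit of a weight-256 vcpu
     * for the same 10ms of CPU time. */
    printf("burn @256: %lld\n", (long long)t2c(slice, 256));
    printf("burn @512: %lld\n", (long long)t2c(slice, 512));

    /* Conversely, 10ms worth of credit lasts twice as long at weight 512. */
    printf("runtime @512: %lld\n", (long long)c2t(slice, 512));
    return 0;
}

Note that weights are purely relative: doubling every domain's weight changes nothing, since the burn rate is scaled by max_weight/weight.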
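The timeslice choice in csched_runtime() can be tried in isolation the same way. The sketch below is a simplified re-statement, not the patch code: it keeps only the "run until you would drop below the next runnable vcpu" term and the clamping to the 500us/2ms limits, and ignores the basic-credit term and the idle-vcpu special case. Credits are expressed directly in nanoseconds, i.e. a vcpu at the maximum weight.

/* Illustrative re-statement of csched_runtime()'s clamping (not the patch code). */
#include <stdio.h>
#include <stdint.h>

typedef int64_t s_time_t;
#define MIN_TIMER   500000LL   /* 500us, cf. CSCHED_MIN_TIMER */
#define MAX_TIMER  2000000LL   /* 2ms,   cf. CSCHED_MAX_TIMER */

/* Run the chosen vcpu until it would fall below the runnable vcpu behind it
 * in the queue, clamped to [MIN_TIMER, MAX_TIMER]. */
static s_time_t runtime(s_time_t next_credit, s_time_t second_credit)
{
    s_time_t time = next_credit - second_credit;

    if ( time < MIN_TIMER )
        time = MIN_TIMER;
    else if ( time > MAX_TIMER )
        time = MAX_TIMER;
    return time;
}

int main(void)
{
    /* 3ms of credit ahead of the next vcpu -> clipped to the 2ms cap. */
    printf("%lld\n", (long long)runtime(5000000, 2000000));
    /* Only 100us ahead -> raised to the 500us floor, bounding the
     * context-switch rate. */
    printf("%lld\n", (long long)runtime(2100000, 2000000));
    return 0;
}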
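The patch only adds the hypervisor side of the new domctl; there is no toolstack plumbing yet. Once the scheduler is selected at boot with the existing "sched=" option (sched=credit2), a toolstack could adjust a domain's weight with a wrapper modelled on the existing xc_sched_credit_domain_set(). The sketch below is hypothetical: the function name and its placement in libxc are assumptions; only XEN_SCHEDULER_CREDIT2, XEN_DOMCTL_SCHEDOP_putinfo and the credit2 union member come from the patch.

/* Hypothetical libxc wrapper (not part of this patch), modelled on the
 * existing xc_sched_credit_domain_set(). */
#include "xc_private.h"

int xc_sched_credit2_domain_set(int xc_handle,
                                uint32_t domid,
                                struct xen_domctl_sched_credit2 *sdom)
{
    DECLARE_DOMCTL;

    domctl.cmd = XEN_DOMCTL_scheduler_op;
    domctl.domain = (domid_t)domid;
    domctl.u.scheduler_op.sched_id = XEN_SCHEDULER_CREDIT2;
    domctl.u.scheduler_op.cmd = XEN_DOMCTL_SCHEDOP_putinfo;
    domctl.u.scheduler_op.u.credit2 = *sdom;

    return do_domctl(xc_handle, &domctl);
}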