* Ryan Harper <ryanh@xxxxxxxxxx> [2005-05-20 11:55]:
> The following patch creates a new hypercall, do_confer() which allows a
Oops. I left in fixes to my domU config, which doesn't exist in the main
tree. I've removed that part in this version.
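
For anyone skimming the diff: the core of the protocol is the per-vcpu
yield_count in shared info, kept even while the vcpu runs and odd while it
is preempted. A lock spinner samples the holder's count and passes it to
the hypervisor, which only accepts the confer if the count is odd and
unchanged, so a stale confer (the holder already ran again) is dropped.
The toy program below is just a userspace illustration of that gating
check -- struct vcpu_state and confer_would_succeed are names made up for
the example, not part of the patch:

#include <stdio.h>

struct vcpu_state {
    unsigned int yield_count;   /* even = running, odd = preempted */
};

/* Mirrors the checks in __spin_yield()/do_confer(): confer only when the
 * sampled count is odd (holder preempted) and still current (not stale). */
static int confer_would_succeed(const struct vcpu_state *holder,
                                unsigned int sampled_count)
{
    if ((holder->yield_count & 1) == 0)
        return 0;               /* holder is running; nothing to confer for */
    if (holder->yield_count != sampled_count)
        return 0;               /* stale sample; holder was rescheduled */
    return 1;
}

int main(void)
{
    struct vcpu_state holder = { .yield_count = 3 };   /* odd: preempted */
    unsigned int sampled = holder.yield_count;

    printf("fresh sample: %d\n", confer_would_succeed(&holder, sampled));
    holder.yield_count++;       /* holder runs again; count goes even */
    printf("stale sample: %d\n", confer_would_succeed(&holder, sampled));
    return 0;
}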
--
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253 T/L: 678-9253
ryanh@xxxxxxxxxx
diffstat output:
linux-2.6.11-xen-sparse/arch/i386/lib/Makefile | 11
linux-2.6.11-xen-sparse/arch/i386/lib/locks.c | 76 +++++
linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S | 2
linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h | 16 +
linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h | 140 ++++++++---
xen/arch/x86/domain.c | 2
xen/arch/x86/x86_32/entry.S | 1
xen/common/domain.c | 1
xen/common/schedule.c | 69 +++++
xen/include/public/xen.h | 11
xen/include/xen/sched.h | 9
11 files changed, 300 insertions(+), 38 deletions(-)
Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
---
diff -urN b/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c confer/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c
--- b/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c	1969-12-31 18:00:00.000000000 -0600
+++ confer/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c	2005-05-20 10:37:58.300767080 -0500
@@ -0,0 +1,76 @@
+/*
+ * Spin and read/write lock operations.
+ *
+ * Copyright (C) 2001-2004 Paul Mackerras <paulus@xxxxxxxxxx>, IBM
+ * Copyright (C) 2001 Anton Blanchard <anton@xxxxxxxxxx>, IBM
+ * Copyright (C) 2002 Dave Engebretsen <engebret@xxxxxxxxxx>, IBM
+ * Rework to support virtual processors
+ * Copyright (C) 2005 Ryan Harper <ryanh@xxxxxxxxxx>, IBM
+ * Rework for Xen on x86
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/stringify.h>
+#include <asm/hypercall.h>
+#include <asm/processor.h>
+
+/* waiting for a spinlock... */
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+void __spin_yield(spinlock_t *lock)
+{
+ unsigned int lock_value, holder_cpu, yield_count;
+ shared_info_t *s = HYPERVISOR_shared_info;
+
+ lock_value = lock->slock;
+ if (lock_value == 1)
+ return;
+ holder_cpu = lock->cpu;
+ BUG_ON(holder_cpu >= NR_CPUS);
+ yield_count = s->vcpu_data[holder_cpu].yield_count;
+ if ((yield_count & 1) == 0)
+ return; /* virtual cpu is currently running */
+ rmb();
+ if (lock->slock != lock_value)
+ return; /* something has changed */
+ HYPERVISOR_confer(holder_cpu, yield_count);
+}
+
+void __rw_yield(rwlock_t *rw)
+{
+ unsigned int lock_value, holder_cpu, yield_count;
+ shared_info_t *s = HYPERVISOR_shared_info;
+
+ lock_value = rw->lock;
+ if (lock_value == RW_LOCK_BIAS)
+ return;
+ holder_cpu = rw->cpu;
+ BUG_ON(holder_cpu >= NR_CPUS);
+ yield_count = s->vcpu_data[holder_cpu].yield_count;
+ if ((yield_count & 1) == 0)
+ return; /* virtual cpu is currently running */
+ rmb();
+ if (rw->lock != lock_value)
+ return; /* something has changed */
+ HYPERVISOR_confer(holder_cpu, yield_count);
+}
+
+void spin_unlock_wait(spinlock_t *lock)
+{
+ while (spin_is_locked(lock)) {
+ cpu_relax();
+ if (SHARED_PROCESSOR)
+ __spin_yield(lock);
+ }
+ cpu_relax();
+}
+EXPORT_SYMBOL(spin_unlock_wait);
+#endif
diff -urN b/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile confer/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile
--- b/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile	1969-12-31 18:00:00.000000000 -0600
+++ confer/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile	2005-05-20 10:37:58.301766928 -0500
@@ -0,0 +1,11 @@
+#
+# Makefile for i386-specific library files..
+#
+
+
+lib-y = checksum.o delay.o usercopy.o getuser.o memcpy.o strstr.o \
+ bitops.o
+
+lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
+lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
+lib-$(CONFIG_XEN) += locks.o
diff -urN b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S confer/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S
--- b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S	2005-05-19 22:20:32.000000000 -0500
+++ confer/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S	2005-05-20 10:37:58.304766472 -0500
@@ -80,7 +80,7 @@
#define evtchn_upcall_pending /* 0 */
#define evtchn_upcall_mask 1
-#define sizeof_vcpu_shift 3
+#define sizeof_vcpu_shift 4
#ifdef CONFIG_SMP
#define preempt_disable(reg) incl TI_preempt_count(reg)
diff -urN b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h confer/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h
--- b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h	2005-05-19 22:20:32.000000000 -0500
+++ confer/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h	2005-05-20 10:37:58.306766168 -0500
@@ -517,4 +517,20 @@
return ret;
}
+static inline int
+HYPERVISOR_confer(
+ unsigned int vcpu, unsigned int yield_count)
+{
+ int ret;
+ unsigned long ign1, ign2;
+
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret), "=b" (ign1), "=c" (ign2)
+ : "0" (__HYPERVISOR_confer), "1" (vcpu), "2" (yield_count)
+ : "memory");
+
+ return ret;
+}
+
#endif /* __HYPERCALL_H__ */
diff -urN b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h confer/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h
--- b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h	2005-05-19 22:20:14.000000000 -0500
+++ confer/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h	2005-05-20 10:37:58.307766016 -0500
@@ -22,10 +22,36 @@
#ifdef CONFIG_PREEMPT
unsigned int break_lock;
#endif
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ unsigned int cpu;
+#endif
} spinlock_t;
#define SPINLOCK_MAGIC 0xdead4ead
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! it is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get a
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ */
+typedef struct {
+ volatile unsigned int lock;
+#ifdef CONFIG_DEBUG_SPINLOCK
+ unsigned magic;
+#endif
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ unsigned int cpu;
+#endif
+} rwlock_t;
+
#ifdef CONFIG_DEBUG_SPINLOCK
#define SPINLOCK_MAGIC_INIT , SPINLOCK_MAGIC
#else
@@ -44,7 +70,20 @@
*/
#define spin_is_locked(x) (*(volatile signed char *)(&(x)->slock) <= 0)
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+#include <linux/smp.h>
+#define SPINLOCK_CPU (smp_processor_id())
+/* We only yield to the hypervisor if we are in shared processor mode */
+#define SHARED_PROCESSOR (HYPERVISOR_shared_info->shproc != 0)
+extern void __spin_yield(spinlock_t *lock);
+extern void __rw_yield(rwlock_t *rw);
+extern void spin_unlock_wait(spinlock_t *lock);
+#else
+#define __spin_yield(x) barrier()
+#define __rw_yield(x) barrier()
+#define SHARED_PROCESSOR 0
#define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x))
+#endif
#define spin_lock_string \
"\n1:\t" \
@@ -125,6 +164,9 @@
"xchgb %b0,%1"
:"=q" (oldval), "=m" (lock->slock)
:"0" (0) : "memory");
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ lock->cpu = SPINLOCK_CPU;
+#endif
return oldval > 0;
}
@@ -136,43 +178,55 @@
BUG();
}
#endif
- __asm__ __volatile__(
- spin_lock_string
- :"=m" (lock->slock) : : "memory");
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ while (1) {
+ if ( likely(_raw_spin_trylock(lock)) )
+ break;
+ do {
+ cpu_relax();
+ if (SHARED_PROCESSOR)
+ __spin_yield(lock);
+ } while (likely(spin_is_locked(lock)));
+ cpu_relax();
+ }
+#else
+ __asm__ __volatile__(
+ spin_lock_string
+ :"=m" (lock->slock) : : "memory");
+#endif
}
static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
{
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ unsigned long flags_dis;
+#endif
#ifdef CONFIG_DEBUG_SPINLOCK
if (unlikely(lock->magic != SPINLOCK_MAGIC)) {
printk("eip: %p\n", __builtin_return_address(0));
BUG();
}
#endif
- __asm__ __volatile__(
- spin_lock_string_flags
- :"=m" (lock->slock) : "r" (flags) : "memory");
-}
-
-/*
- * Read-write spinlocks, allowing multiple readers
- * but only one writer.
- *
- * NOTE! it is quite common to have readers in interrupts
- * but no interrupt writers. For those circumstances we
- * can "mix" irq-safe locks - any writer needs to get a
- * irq-safe write-lock, but readers can get non-irqsafe
- * read-locks.
- */
-typedef struct {
- volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
- unsigned magic;
-#endif
-#ifdef CONFIG_PREEMPT
- unsigned int break_lock;
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ while (1) {
+ if ( likely(_raw_spin_trylock(lock)) )
+ break;
+ local_save_flags(flags_dis);
+ local_irq_restore(flags);
+ do {
+ cpu_relax();
+ if (SHARED_PROCESSOR)
+ __spin_yield(lock);
+ } while (likely(spin_is_locked(lock)));
+ cpu_relax();
+ local_irq_restore(flags_dis);
+ }
+#else
+ __asm__ __volatile__(
+ spin_lock_string_flags
+ :"=m" (lock->slock) : "r" (flags) : "memory");
#endif
-} rwlock_t;
+}
#define RWLOCK_MAGIC 0xdeaf1eed
@@ -198,6 +252,18 @@
*/
#define write_can_lock(x) ((x)->lock == RW_LOCK_BIAS)
+static inline int _raw_write_trylock(rwlock_t *lock)
+{
+ atomic_t *count = (atomic_t *)lock;
+ if (atomic_sub_and_test(RW_LOCK_BIAS, count)) {
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ lock->cpu = SPINLOCK_CPU;
+#endif
+ return 1;
+ }
+ atomic_add(RW_LOCK_BIAS, count);
+ return 0;
+}
/*
* On x86, we implement read-write locks as a 32-bit counter
* with the high bit (sign) being the "contended" bit.
@@ -222,7 +288,20 @@
#ifdef CONFIG_DEBUG_SPINLOCK
BUG_ON(rw->magic != RWLOCK_MAGIC);
#endif
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ while (1) {
+ if ( likely(_raw_write_trylock(rw)) )
+ break;
+ do {
+ cpu_relax();
+ if (SHARED_PROCESSOR)
+ __rw_yield(rw);
+ } while ( likely(!write_can_lock(rw)));
+ cpu_relax();
+ }
+#else
__build_write_lock(rw, "__write_lock_failed");
+#endif
}
#define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
@@ -238,13 +317,6 @@
return 0;
}
-static inline int _raw_write_trylock(rwlock_t *lock)
-{
- atomic_t *count = (atomic_t *)lock;
- if (atomic_sub_and_test(RW_LOCK_BIAS, count))
- return 1;
- atomic_add(RW_LOCK_BIAS, count);
- return 0;
-}
+
#endif /* __ASM_SPINLOCK_H */
diff -urN b/xen/arch/x86/domain.c confer/xen/arch/x86/domain.c
--- b/xen/arch/x86/domain.c 2005-05-19 22:20:28.000000000 -0500
+++ confer/xen/arch/x86/domain.c 2005-05-20 10:38:29.187071648 -0500
@@ -253,6 +253,8 @@
memset(d->shared_info, 0, PAGE_SIZE);
ed->vcpu_info = &d->shared_info->vcpu_data[ed->vcpu_id];
ed->cpumap = CPUMAP_RUNANYWHERE;
+ /* default vcpus to sharing physical cpus */
+ d->shared_info->shproc = 1;
SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
PAGE_SHIFT] = INVALID_M2P_ENTRY;
diff -urN b/xen/arch/x86/x86_32/entry.S confer/xen/arch/x86/x86_32/entry.S
--- b/xen/arch/x86/x86_32/entry.S 2005-05-19 22:20:33.000000000 -0500
+++ confer/xen/arch/x86/x86_32/entry.S 2005-05-20 10:37:58.353759024 -0500
@@ -749,6 +749,7 @@
.long do_boot_vcpu
.long do_ni_hypercall /* 25 */
.long do_mmuext_op
+ .long do_confer
.rept NR_hypercalls-((.-hypercall_table)/4)
.long do_ni_hypercall
.endr
diff -urN b/xen/common/domain.c confer/xen/common/domain.c
--- b/xen/common/domain.c 2005-05-19 22:20:15.000000000 -0500
+++ confer/xen/common/domain.c 2005-05-20 10:37:58.354758872 -0500
@@ -289,6 +289,7 @@
atomic_set(&ed->pausecnt, 0);
ed->cpumap = CPUMAP_RUNANYWHERE;
+ set_bit(_VCPUF_canconfer, &ed->vcpu_flags);
memcpy(&ed->arch, &idle0_exec_domain.arch, sizeof(ed->arch));
diff -urN b/xen/common/schedule.c confer/xen/common/schedule.c
--- b/xen/common/schedule.c 2005-05-19 22:20:30.000000000 -0500
+++ confer/xen/common/schedule.c 2005-05-20 10:45:41.493351104 -0500
@@ -224,6 +224,11 @@
spin_lock_irqsave(&schedule_data[ed->processor].schedule_lock, flags);
if ( likely(domain_runnable(ed)) )
{
+ /* mark current's confer state */
+ if ( test_bit(_VCPUF_conferring, &current->vcpu_flags) ) {
+ clear_bit(_VCPUF_conferring, &current->vcpu_flags);
+ set_bit(_VCPUF_conferred, &current->vcpu_flags);
+ }
SCHED_OP(wake, ed);
#ifdef WAKE_HISTO
ed->wokenup = NOW();
@@ -273,6 +278,54 @@
return 0;
}
+/* Confer control to another vcpu */
+long do_confer(unsigned int vcpu, unsigned int yield_count)
+{
+ struct domain *d = current->domain;
+
+ /* Validate CONFER prereqs:
+ * - vcpu is within bounds
+ * - vcpu is valid in this domain
+ * - current has not already conferred its slice to vcpu
+ * - vcpu is not already running
+ * - designated vcpu's yield_count matches value from call
+ *
+ * If all are ok, then set the conferred state and enter the scheduler.
+ */
+
+ if (vcpu >= MAX_VIRT_CPUS)
+ return 0;
+
+ if (d->exec_domain[vcpu] == NULL)
+ return 0;
+
+ if (!test_bit(_VCPUF_canconfer, &current->vcpu_flags))
+ return 0;
+
+ /* even counts indicate a running vcpu, odd is preempted/conferred */
+ /* don't confer if holder is currently running */
+ if ((d->exec_domain[vcpu]->vcpu_info->yield_count & 1) == 0)
+ return 0;
+
+ if (d->exec_domain[vcpu]->vcpu_info->yield_count != yield_count)
+ return 0;
+
+ /*
+ * set current's state to conferring, wake target
+ */
+ clear_bit(_VCPUF_canconfer, &current->vcpu_flags);
+ set_bit(_VCPUF_conferring, &current->vcpu_flags);
+ domain_wake(d->exec_domain[vcpu]);
+
+ /* request scheduling for woken domain */
+ raise_softirq(SCHEDULE_SOFTIRQ);
+
+ /* give up my timeslice */
+ do_yield();
+
+ return 0;
+}
+
/*
* Demultiplex scheduler-related hypercalls.
*/
@@ -441,7 +494,15 @@
r_time = next_slice.time;
next = next_slice.task;
-
+
+ /*
+ * always mark the incoming vcpu able to confer during its slice, and
+ * clear any stale conferring/conferred state left from its last slice
+ */
+ set_bit(_VCPUF_canconfer, &next->vcpu_flags);
+ clear_bit(_VCPUF_conferring, &next->vcpu_flags);
+ clear_bit(_VCPUF_conferred, &next->vcpu_flags);
+
schedule_data[cpu].curr = next;
next->lastschd = now;
@@ -455,6 +516,12 @@
spin_unlock_irq(&schedule_data[cpu].schedule_lock);
+ /* bump the vcpu yield_count when the controlling domain is not idle */
+ if ( !is_idle_task(prev->domain) )
+ prev->vcpu_info->yield_count++;
+ if ( !is_idle_task(next->domain) )
+ next->vcpu_info->yield_count++;
+
if ( unlikely(prev == next) ) {
#ifdef ADV_SCHED_HISTO
adv_sched_hist_to_stop(cpu);
diff -urN b/xen/include/public/xen.h confer/xen/include/public/xen.h
--- b/xen/include/public/xen.h 2005-05-19 22:20:11.000000000 -0500
+++ confer/xen/include/public/xen.h 2005-05-20 10:37:58.368756744 -0500
@@ -58,6 +58,7 @@
#define __HYPERVISOR_boot_vcpu 24
#define __HYPERVISOR_set_segment_base 25 /* x86/64 only */
#define __HYPERVISOR_mmuext_op 26
+#define __HYPERVISOR_confer 27
/*
* MULTICALLS
@@ -334,8 +335,11 @@
u8 evtchn_upcall_mask; /* 1 */
u8 pad0, pad1;
u32 evtchn_pending_sel; /* 4 */
- arch_vcpu_info_t arch; /* 8 */
-} PACKED vcpu_info_t; /* 8 + arch */
+ /* Even while the vcpu is running; odd when it is preempted/conferred */
+ u32 yield_count; /* 8 */
+ u32 pad2; /* 12 */
+ arch_vcpu_info_t arch; /* 16 */
+} PACKED vcpu_info_t; /* 16 + arch */
/*
* Xen/kernel shared data -- pointer provided in start_info.
@@ -347,6 +351,9 @@
u32 n_vcpu;
+ /* set if domains' vcpus share physical cpus */
+ int shproc;
+
/*
* A domain can have up to 1024 "event channels" on which it can send
* and receive asynchronous event notifications. There are three classes
diff -urN b/xen/include/xen/sched.h confer/xen/include/xen/sched.h
--- b/xen/include/xen/sched.h 2005-05-19 22:20:07.000000000 -0500
+++ confer/xen/include/xen/sched.h 2005-05-20 10:37:58.378755224 -0500
@@ -358,6 +358,15 @@
/* Initialization completed. */
#define _VCPUF_initialised 8
#define VCPUF_initialised (1UL<<_VCPUF_initialised)
+ /* Able to give time slice to another vcpu */
+#define _VCPUF_canconfer 9
+#define VCPUF_canconfer (1UL<<_VCPUF_canconfer)
+ /* Currently giving time slice to another vcpu */
+#define _VCPUF_conferring 10
+#define VCPUF_conferring (1UL<<_VCPUF_conferring)
+ /* Already given time slice to another vcpu */
+#define _VCPUF_conferred 11
+#define VCPUF_conferred (1UL<<_VCPUF_conferred)
/*
* Per-domain flags (domain_flags).
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel