* Ryan Harper <ryanh@xxxxxxxxxx> [2005-05-20 11:55]:
> The following patch creates a new hypercall, do_confer() which allows a
Oops. I left in fixes to my domU config, which doesn't exist in the main
tree. I've removed that part in this version.
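
For anyone skimming the diff: the core of the protocol is the per-vcpu
yield_count in shared info, kept even while the vcpu runs and odd while it
is preempted. A lock spinner samples the holder's count and passes it to
the hypervisor, which only accepts the confer if the count is odd and
unchanged, so a stale confer (the holder already ran again) is dropped.
The toy program below is just a userspace illustration of that gating
check -- struct vcpu_state and confer_would_succeed are names made up for
the example, not part of the patch:

#include <stdio.h>

struct vcpu_state {
    unsigned int yield_count;   /* even = running, odd = preempted */
};

/* Mirrors the checks in __spin_yield()/do_confer(): confer only when the
 * sampled count is odd (holder preempted) and still current (not stale). */
static int confer_would_succeed(const struct vcpu_state *holder,
                                unsigned int sampled_count)
{
    if ((holder->yield_count & 1) == 0)
        return 0;               /* holder is running; nothing to confer for */
    if (holder->yield_count != sampled_count)
        return 0;               /* stale sample; holder was rescheduled */
    return 1;
}

int main(void)
{
    struct vcpu_state holder = { .yield_count = 3 };   /* odd: preempted */
    unsigned int sampled = holder.yield_count;

    printf("fresh sample: %d\n", confer_would_succeed(&holder, sampled));
    holder.yield_count++;       /* holder runs again; count goes even */
    printf("stale sample: %d\n", confer_would_succeed(&holder, sampled));
    return 0;
}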
--
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253 T/L: 678-9253
ryanh@xxxxxxxxxx
diffstat output:
linux-2.6.11-xen-sparse/arch/i386/lib/Makefile | 11
linux-2.6.11-xen-sparse/arch/i386/lib/locks.c | 76 +++++
linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S | 2
linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h | 16 +
linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h | 140 ++++++++---
xen/arch/x86/domain.c | 2
xen/arch/x86/x86_32/entry.S | 1
xen/common/domain.c | 1
xen/common/schedule.c | 69 +++++
xen/include/public/xen.h | 11
xen/include/xen/sched.h | 9
11 files changed, 300 insertions(+), 38 deletions(-)
Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
---
diff -urN b/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c confer/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c
--- b/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c	1969-12-31 18:00:00.000000000 -0600
+++ confer/linux-2.6.11-xen-sparse/arch/i386/lib/locks.c	2005-05-20 10:37:58.300767080 -0500
@@ -0,0 +1,76 @@
+/*
+ * Spin and read/write lock operations.
+ *
+ * Copyright (C) 2001-2004 Paul Mackerras <paulus@xxxxxxxxxx>, IBM
+ * Copyright (C) 2001 Anton Blanchard <anton@xxxxxxxxxx>, IBM
+ * Copyright (C) 2002 Dave Engebretsen <engebret@xxxxxxxxxx>, IBM
+ * Rework to support virtual processors
+ * Copyright (C) 2005 Ryan Harper <ryanh@xxxxxxxxxx>, IBM
+ * Rework for Xen on x86
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/stringify.h>
+#include <asm/hypercall.h>
+#include <asm/processor.h>
+
+/* waiting for a spinlock... */
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+void __spin_yield(spinlock_t *lock)
+{
+ unsigned int lock_value, holder_cpu, yield_count;
+ shared_info_t *s = HYPERVISOR_shared_info;
+
+ lock_value = lock->slock;
+ if (lock_value == 1)
+ return;
+ holder_cpu = lock->cpu;
+ BUG_ON(holder_cpu >= NR_CPUS);
+ yield_count = s->vcpu_data[holder_cpu].yield_count;
+ if ((yield_count & 1) == 0)
+ return; /* virtual cpu is currently running */
+ rmb();
+ if (lock->slock != lock_value)
+ return; /* something has changed */
+ HYPERVISOR_confer(holder_cpu, yield_count);
+}
+
+void __rw_yield(rwlock_t *rw)
+{
+ unsigned int lock_value, holder_cpu, yield_count;
+ shared_info_t *s = HYPERVISOR_shared_info;
+
+ lock_value = rw->lock;
+ if (lock_value == RW_LOCK_BIAS)
+ return;
+ holder_cpu = rw->cpu;
+ BUG_ON(holder_cpu >= NR_CPUS);
+ yield_count = s->vcpu_data[holder_cpu].yield_count;
+ if ((yield_count & 1) == 0)
+ return; /* virtual cpu is currently running */
+ rmb();
+ if (rw->lock != lock_value)
+ return; /* something has changed */
+ HYPERVISOR_confer(holder_cpu, yield_count);
+}
+
+void spin_unlock_wait(spinlock_t *lock)
+{
+ while (spin_is_locked(lock)) {
+ cpu_relax();
+ if (SHARED_PROCESSOR)
+ __spin_yield(lock);
+ }
+ cpu_relax();
+}
+EXPORT_SYMBOL(spin_unlock_wait);
+#endif
diff -urN b/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile confer/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile
--- b/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile	1969-12-31 18:00:00.000000000 -0600
+++ confer/linux-2.6.11-xen-sparse/arch/i386/lib/Makefile	2005-05-20 10:37:58.301766928 -0500
@@ -0,0 +1,11 @@
+#
+# Makefile for i386-specific library files..
+#
+
+
+lib-y = checksum.o delay.o usercopy.o getuser.o memcpy.o strstr.o \
+ bitops.o
+
+lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
+lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
+lib-$(CONFIG_XEN) += locks.o
diff -urN b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S confer/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S
--- b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S	2005-05-19 22:20:32.000000000 -0500
+++ confer/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S	2005-05-20 10:37:58.304766472 -0500
@@ -80,7 +80,7 @@
#define evtchn_upcall_pending /* 0 */
#define evtchn_upcall_mask 1
-#define sizeof_vcpu_shift 3
+#define sizeof_vcpu_shift 4
#ifdef CONFIG_SMP
#define preempt_disable(reg) incl TI_preempt_count(reg)
diff -urN b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h confer/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h
--- b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h	2005-05-19 22:20:32.000000000 -0500
+++ confer/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h	2005-05-20 10:37:58.306766168 -0500
@@ -517,4 +517,20 @@
return ret;
}
+static inline int
+HYPERVISOR_confer(
+ unsigned int vcpu, unsigned int yield_count)
+{
+ int ret;
+ unsigned long ign1, ign2;
+
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret), "=b" (ign1), "=c" (ign2)
+ : "0" (__HYPERVISOR_confer), "1" (vcpu), "2" (yield_count)
+ : "memory");
+
+ return ret;
+}
+
#endif /* __HYPERCALL_H__ */
diff -urN b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h confer/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h
--- b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h	2005-05-19 22:20:14.000000000 -0500
+++ confer/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h	2005-05-20 10:37:58.307766016 -0500
@@ -22,10 +22,36 @@
#ifdef CONFIG_PREEMPT
unsigned int break_lock;
#endif
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ unsigned int cpu;
+#endif
} spinlock_t;
#define SPINLOCK_MAGIC 0xdead4ead
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! it is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get a
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ */
+typedef struct {
+ volatile unsigned int lock;
+#ifdef CONFIG_DEBUG_SPINLOCK
+ unsigned magic;
+#endif
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ unsigned int cpu;
+#endif
+} rwlock_t;
+
#ifdef CONFIG_DEBUG_SPINLOCK
#define SPINLOCK_MAGIC_INIT , SPINLOCK_MAGIC
#else
@@ -44,7 +70,20 @@
*/
#define spin_is_locked(x) (*(volatile signed char *)(&(x)->slock) <= 0)
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+#include <linux/smp.h>
+#define SPINLOCK_CPU (smp_processor_id())
+/* We only yield to the hypervisor if we are in shared processor mode */
+#define SHARED_PROCESSOR (HYPERVISOR_shared_info->shproc != 0)
+extern void __spin_yield(spinlock_t *lock);
+extern void __rw_yield(rwlock_t *rw);
+extern void spin_unlock_wait(spinlock_t *lock);
+#else
+#define __spin_yield(x) barrier()
+#define __rw_yield(x) barrier()
+#define SHARED_PROCESSOR 0
#define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x))
+#endif
#define spin_lock_string \
"\n1:\t" \
@@ -125,6 +164,9 @@
"xchgb %b0,%1"
:"=q" (oldval), "=m" (lock->slock)
:"0" (0) : "memory");
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ lock->cpu = SPINLOCK_CPU;
+#endif
return oldval > 0;
}
@@ -136,43 +178,55 @@
BUG();
}
#endif
- __asm__ __volatile__(
- spin_lock_string
- :"=m" (lock->slock) : : "memory");
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ while (1) {
+ if ( likely(_raw_spin_trylock(lock)) )
+ break;
+ do {
+ cpu_relax();
+ if (SHARED_PROCESSOR)
+ __spin_yield(lock);
+ } while (likely(spin_is_locked(lock)));
+ cpu_relax();
+ }
+#else
+ __asm__ __volatile__(
+ spin_lock_string
+ :"=m" (lock->slock) : : "memory");
+#endif
}
static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
{
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ unsigned long flags_dis;
+#endif
#ifdef CONFIG_DEBUG_SPINLOCK
if (unlikely(lock->magic != SPINLOCK_MAGIC)) {
printk("eip: %p\n", __builtin_return_address(0));
BUG();
}
#endif
- __asm__ __volatile__(
- spin_lock_string_flags
- :"=m" (lock->slock) : "r" (flags) : "memory");
-}
-
-/*
- * Read-write spinlocks, allowing multiple readers
- * but only one writer.
- *
- * NOTE! it is quite common to have readers in interrupts
- * but no interrupt writers. For those circumstances we
- * can "mix" irq-safe locks - any writer needs to get a
- * irq-safe write-lock, but readers can get non-irqsafe
- * read-locks.
- */
-typedef struct {
- volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
- unsigned magic;
-#endif
-#ifdef CONFIG_PREEMPT
- unsigned int break_lock;
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ while (1) {
+ if ( likely(_raw_spin_trylock(lock)) )
+ break;
+ local_save_flags(flags_dis);
+ local_irq_restore(flags);
+ do {
+ cpu_relax();
+ if (SHARED_PROCESSOR)
+ __spin_yield(lock);
+ } while (likely(spin_is_locked(lock)));
+ cpu_relax();
+ local_irq_restore(flags_dis);
+ }
+#else
+ __asm__ __volatile__(
+ spin_lock_string_flags
+ :"=m" (lock->slock) : "r" (flags) : "memory");
#endif
-} rwlock_t;
+}
#define RWLOCK_MAGIC 0xdeaf1eed
@@ -198,6 +252,18 @@
*/
#define write_can_lock(x) ((x)->lock == RW_LOCK_BIAS)
+static inline int _raw_write_trylock(rwlock_t *lock)
+{
+ atomic_t *count = (atomic_t *)lock;
+ if (atomic_sub_and_test(RW_LOCK_BIAS, count)) {
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ lock->cpu = SPINLOCK_CPU;
+#endif
+ return 1;
+ }
+ atomic_add(RW_LOCK_BIAS, count);
+ return 0;
+}
/*
* On x86, we implement read-write locks as a 32-bit counter
* with the high bit (sign) being the "contended" bit.
@@ -222,7 +288,20 @@
#ifdef CONFIG_DEBUG_SPINLOCK
BUG_ON(rw->magic != RWLOCK_MAGIC);
#endif
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
+ while (1) {
+ if ( likely(_raw_write_trylock(rw)) )
+ break;
+ do {
+ cpu_relax();
+ if (SHARED_PROCESSOR)
+ __rw_yield(rw);
+ } while ( likely(!write_can_lock(rw)));
+ cpu_relax();
+ }
+#else
__build_write_lock(rw, "__write_lock_failed");
+#endif
}
#define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
@@ -238,13 +317,6 @@
return 0;
}
-static inline int _raw_write_trylock(rwlock_t *lock)
-{
- atomic_t *count = (atomic_t *)lock;
- if (atomic_sub_and_test(RW_LOCK_BIAS, count))
- return 1;
- atomic_add(RW_LOCK_BIAS, count);
- return 0;
-}
+
#endif /* __ASM_SPINLOCK_H */
diff -urN b/xen/arch/x86/domain.c confer/xen/arch/x86/domain.c
--- b/xen/arch/x86/domain.c 2005-05-19 22:20:28.000000000 -0500
+++ confer/xen/arch/x86/domain.c 2005-05-20 10:38:29.187071648 -0500
@@ -253,6 +253,8 @@
memset(d->shared_info, 0, PAGE_SIZE);
ed->vcpu_info = &d->shared_info->vcpu_data[ed->vcpu_id];
ed->cpumap = CPUMAP_RUNANYWHERE;
+ /* default vcpus to sharing physical cpus */
+ d->shared_info->shproc = 1;
SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
PAGE_SHIFT] = INVALID_M2P_ENTRY;
diff -urN b/xen/arch/x86/x86_32/entry.S confer/xen/arch/x86/x86_32/entry.S
--- b/xen/arch/x86/x86_32/entry.S 2005-05-19 22:20:33.000000000 -0500
+++ confer/xen/arch/x86/x86_32/entry.S 2005-05-20 10:37:58.353759024 -0500
@@ -749,6 +749,7 @@
.long do_boot_vcpu
.long do_ni_hypercall /* 25 */
.long do_mmuext_op
+ .long do_confer
.rept NR_hypercalls-((.-hypercall_table)/4)
.long do_ni_hypercall
.endr
diff -urN b/xen/common/domain.c confer/xen/common/domain.c
--- b/xen/common/domain.c 2005-05-19 22:20:15.000000000 -0500
+++ confer/xen/common/domain.c 2005-05-20 10:37:58.354758872 -0500
@@ -289,6 +289,7 @@
atomic_set(&ed->pausecnt, 0);
ed->cpumap = CPUMAP_RUNANYWHERE;
+ set_bit(_VCPUF_canconfer, &ed->vcpu_flags);
memcpy(&ed->arch, &idle0_exec_domain.arch, sizeof(ed->arch));
diff -urN b/xen/common/schedule.c confer/xen/common/schedule.c
--- b/xen/common/schedule.c 2005-05-19 22:20:30.000000000 -0500
+++ confer/xen/common/schedule.c 2005-05-20 10:45:41.493351104 -0500
@@ -224,6 +224,11 @@
spin_lock_irqsave(&schedule_data[ed->processor].schedule_lock, flags);
if ( likely(domain_runnable(ed)) )
{
+ /* mark current's confer state */
+ if ( test_bit(_VCPUF_conferring, &current->vcpu_flags) ) {
+ clear_bit(_VCPUF_conferring, &current->vcpu_flags);
+ set_bit(_VCPUF_conferred, &current->vcpu_flags);
+ }
SCHED_OP(wake, ed);
#ifdef WAKE_HISTO
ed->wokenup = NOW();
@@ -273,6 +278,54 @@
return 0;
}
+/* Confer control to another vcpu */
+long do_confer(unsigned int vcpu, unsigned int yield_count)
+{
+ struct domain *d = current->domain;
+
+ /* Validate CONFER prereqs:
+ * - vcpu is within bounds
+ * - vcpu is valid in this domain
+ * - current has not already conferred its slice to vcpu
+ * - vcpu is not already running
+ * - designated vcpu's yield_count matches value from call
+ *
+ * If all are ok, then set the conferred state and enter the scheduler.
+ */
+
+ if (vcpu >= MAX_VIRT_CPUS)
+ return 0;
+
+ if (d->exec_domain[vcpu] == NULL)
+ return 0;
+
+ if (!test_bit(_VCPUF_canconfer, &current->vcpu_flags))
+ return 0;
+
+ /* even counts indicate a running vcpu, odd is preempted/conferred */
+ /* don't confer if holder is currently running */
+ if ((d->exec_domain[vcpu]->vcpu_info->yield_count & 1) == 0)
+ return 0;
+
+ if (d->exec_domain[vcpu]->vcpu_info->yield_count != yield_count)
+ return 0;
+
+ /*
+ * set current's state to conferring, wake target
+ */
+ clear_bit(_VCPUF_canconfer, &current->vcpu_flags);
+ set_bit(_VCPUF_conferring, &current->vcpu_flags);
+ domain_wake(d->exec_domain[vcpu]);
+
+ /* request scheduling for woken domain */
+ raise_softirq(SCHEDULE_SOFTIRQ);
+
+ /* give up my timeslice */
+ do_yield();
+
+ return 0;
+}
+
/*
* Demultiplex scheduler-related hypercalls.
*/
@@ -441,7 +494,15 @@
r_time = next_slice.time;
next = next_slice.task;
-
+
+ /*
+ * always mark the incoming vcpu able to confer during its slice, and
+ * clear any stale conferring/conferred state left from its last slice
+ */
+ set_bit(_VCPUF_canconfer, &next->vcpu_flags);
+ clear_bit(_VCPUF_conferring, &next->vcpu_flags);
+ clear_bit(_VCPUF_conferred, &next->vcpu_flags);
+
schedule_data[cpu].curr = next;
next->lastschd = now;
@@ -455,6 +516,12 @@
spin_unlock_irq(&schedule_data[cpu].schedule_lock);
+ /* bump the vcpu yield_count when the controlling domain is not idle */
+ if ( !is_idle_task(prev->domain) )
+ prev->vcpu_info->yield_count++;
+ if ( !is_idle_task(next->domain) )
+ next->vcpu_info->yield_count++;
+
if ( unlikely(prev == next) ) {
#ifdef ADV_SCHED_HISTO
adv_sched_hist_to_stop(cpu);
diff -urN b/xen/include/public/xen.h confer/xen/include/public/xen.h
--- b/xen/include/public/xen.h 2005-05-19 22:20:11.000000000 -0500
+++ confer/xen/include/public/xen.h 2005-05-20 10:37:58.368756744 -0500
@@ -58,6 +58,7 @@
#define __HYPERVISOR_boot_vcpu 24
#define __HYPERVISOR_set_segment_base 25 /* x86/64 only */
#define __HYPERVISOR_mmuext_op 26
+#define __HYPERVISOR_confer 27
/*
* MULTICALLS
@@ -334,8 +335,11 @@
u8 evtchn_upcall_mask; /* 1 */
u8 pad0, pad1;
u32 evtchn_pending_sel; /* 4 */
- arch_vcpu_info_t arch; /* 8 */
-} PACKED vcpu_info_t; /* 8 + arch */
+ /* Even while the vcpu is running; odd when it is preempted/conferred */
+ u32 yield_count; /* 8 */
+ u32 pad2; /* 12 */
+ arch_vcpu_info_t arch; /* 16 */
+} PACKED vcpu_info_t; /* 16 + arch */
/*
* Xen/kernel shared data -- pointer provided in start_info.
@@ -347,6 +351,9 @@
u32 n_vcpu;
+ /* set if domains' vcpus share physical cpus */
+ int shproc;
+
/*
* A domain can have up to 1024 "event channels" on which it can send
* and receive asynchronous event notifications. There are three classes
diff -urN b/xen/include/xen/sched.h confer/xen/include/xen/sched.h
--- b/xen/include/xen/sched.h 2005-05-19 22:20:07.000000000 -0500
+++ confer/xen/include/xen/sched.h 2005-05-20 10:37:58.378755224 -0500
@@ -358,6 +358,15 @@
/* Initialization completed. */
#define _VCPUF_initialised 8
#define VCPUF_initialised (1UL<<_VCPUF_initialised)
+ /* Able to give time slice to another vcpu */
+#define _VCPUF_canconfer 9
+#define VCPUF_canconfer (1UL<<_VCPUF_canconfer)
+ /* Currently giving time slice to another vcpu */
+#define _VCPUF_conferring 10
+#define VCPUF_conferring (1UL<<_VCPUF_conferring)
+ /* Already given time slice to another vcpu */
+#define _VCPUF_conferred 11
+#define VCPUF_conferred (1UL<<_VCPUF_conferred)
/*
* Per-domain flags (domain_flags).
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel