Jan Beulich wrote:
Storing the previous value locally is fine. But I don't think you can get by
with just one 'currently spinning' pointer, because of the kicking-side
requirements: if an irq-save lock interrupts a non-irq one (with the
spinning pointer already set), and a remote CPU then releases the lock and
wants to kick you, it won't be able to, since the irq-save lock has already
replaced the non-irq one in the pointer. So if you're past the try-lock, you
may end up never getting the wakeup.
Since a CPU can only be spinning on one non-irq and one irq-save lock at a
time (the latter as long as you don't re-enable interrupts), two fields, on
the other hand, are sufficient.
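(For reference, the kicking side being referred to looks roughly like the
sketch below -- this is an illustration, not part of the patch further down;
xen_send_IPI_one() and XEN_SPIN_UNLOCK_VECTOR are assumed from the rest of
the series. The releasing CPU only kicks a CPU whose lock_spinners entry
still points at this lock, which is why a nested lock overwriting that
single pointer can hide the blocked outer waiter:)

static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
{
	int cpu;

	for_each_online_cpu(cpu) {
		/* only CPUs currently advertising interest in xl get kicked */
		if (per_cpu(lock_spinners, cpu) == xl) {
			xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
			break;
		}
	}
}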
No, I think it's mostly OK. The sequence is:
1. mark that we're about to block
2. clear pending state on irq
3. test again with trylock
4. block in poll
The worrisome interval is between steps 3 and 4, but it's only a problem if
we end up entering the poll with the lock free and the interrupt not pending.
There are a few possibilities for what can happen if an interrupt takes a
nested spinlock in that interval:
1. the nested spinlock is fastpath only, and takes some other lock
-> OK, because we're still marked as interested in this lock, and
will be kicked
-> if the nested spinlock takes the same lock as the outer one, it
will end up doing a self-kick
2. the nested spinlock is slowpath and is kicked
-> OK, because it will leave the irq pending
3. the nested spinlock is slowpath, but picks up the lock on retry
-> bad, because it won't leave the irq pending.
The fix is to make sure that in case 3 the nested path checks whether it's
interrupting a blocking lock (by checking whether prev != NULL), and leaves
the irq pending if it is, as sketched below.
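To make the bad interval concrete, this is roughly how the case-3
interleaving and the fix line up (illustration only; the names are the ones
used in the patch below):

/*
 * Outer lock A is taken with interrupts enabled; nested lock B is taken
 * from an interrupt landing between steps 3 and 4:
 *
 *   outer: prev = spinning_lock(A);          // prev == NULL
 *   outer: xen_clear_irq_pending(irq);
 *   outer: xen_spin_trylock(A) fails;        // A still held remotely
 *     irq: prev = spinning_lock(B);          // prev == A: interrupted a blocker
 *     irq: xen_clear_irq_pending(irq);       // may swallow A's kick
 *     irq: xen_spin_trylock(B) succeeds;     // case 3: picked the lock up on retry
 *     irq: if (prev != NULL)
 *     irq:         xen_set_irq_pending(irq); // the fix: re-arm the wakeup
 *     irq: unspinning_lock(B, prev);
 *   outer: xen_poll_irq(irq);                // returns at once, irq is pending
 *   outer: loop around: clear pending, retry the trylock on A
 */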
Updated patch below. Compile tested only.
Btw., I also think that using an xchg() (and hence a locked transaction)
for updating the pointer isn't necessary.
I was concerned about what would happen if an interrupt got between the
fetch and store. But thinking about it, I think you're right.
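Roughly why the unlocked fetch/store pair looks safe (an illustration, not
code from the patch): the only thing that can interleave with it is an
interrupt on the same CPU, and the nested path always writes back whatever
value it found before returning:

/*
 *   outer: prev = lock_spinners;     // reads NULL
 *     irq: prev' = lock_spinners;    // also reads NULL
 *     irq: lock_spinners = B;
 *     irq: ...
 *     irq: lock_spinners = prev';    // restores NULL before returning
 *   outer: lock_spinners = A;        // store completes as if uninterrupted
 *
 * Whichever side of the store the interrupt lands on, it restores the
 * value it saw, so the outer context's update is never lost and a locked
 * xchg() buys nothing.
 */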
On an 8-core system I'm seeing between 20,000 (x86-64) and 35,000
(i686) wakeup interrupts per CPU. I'm not certain this still counts as rare.
Though that number may go down a little once the hypervisor doesn't
needlessly wake all polling vCPU-s anymore.
What workload are you seeing that on? 20-35k interrupts over what time
period?
Oh, sorry, I meant to say that's for a kernel build (-j12), taking about
400 wall seconds.
In my tests, I only see it fall into the slow path a couple of thousand
times per cpu for a kernbench run.
Hmm, that's different than what I see, then. Actually, I see a significant
spike at modpost stage 2, when all the .ko-s get linked. The (spinlock)
interrupt rate gets up to between 1,000 and 2,000 per CPU per second.
defconfig? allmodconfig?
I'll measure again to confirm.
J
diff -r 5b4b80c08799 arch/x86/xen/spinlock.c
--- a/arch/x86/xen/spinlock.c Wed Aug 06 01:35:09 2008 -0700
+++ b/arch/x86/xen/spinlock.c Wed Aug 06 10:51:27 2008 -0700
@@ -22,6 +22,7 @@
u64 taken_slow;
u64 taken_slow_pickup;
u64 taken_slow_irqenable;
+ u64 taken_slow_spurious;
u64 released;
u64 released_slow;
@@ -135,25 +136,41 @@
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
-static inline void spinning_lock(struct xen_spinlock *xl)
+/*
+ * Mark a cpu as interested in a lock. Returns the CPU's previous
+ * lock of interest, in case we got preempted by an interrupt.
+ */
+static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
{
+ struct xen_spinlock *prev;
+
+ prev = __get_cpu_var(lock_spinners);
__get_cpu_var(lock_spinners) = xl;
+
wmb(); /* set lock of interest before count */
+
asm(LOCK_PREFIX " incw %0"
: "+m" (xl->spinners) : : "memory");
+
+ return prev;
}
-static inline void unspinning_lock(struct xen_spinlock *xl)
+/*
+ * Mark a cpu as no longer interested in a lock. Restores previous
+ * lock of interest (NULL for none).
+ */
+static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
{
asm(LOCK_PREFIX " decw %0"
: "+m" (xl->spinners) : : "memory");
- wmb(); /* decrement count before clearing lock */
- __get_cpu_var(lock_spinners) = NULL;
+ wmb(); /* decrement count before restoring lock */
+ __get_cpu_var(lock_spinners) = prev;
}
static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+ struct xen_spinlock *prev;
int irq = __get_cpu_var(lock_kicker_irq);
int ret;
unsigned long flags;
@@ -163,33 +180,56 @@
return 0;
/* announce we're spinning */
- spinning_lock(xl);
+ prev = spinning_lock(xl);
+
flags = __raw_local_save_flags();
if (irq_enable) {
ADD_STATS(taken_slow_irqenable, 1);
raw_local_irq_enable();
}
- /* clear pending */
- xen_clear_irq_pending(irq);
+ ADD_STATS(taken_slow, 1);
+
+ do {
+ /* clear pending */
+ xen_clear_irq_pending(irq);
- ADD_STATS(taken_slow, 1);
+ /* check again make sure it didn't become free while
+ we weren't looking */
+ ret = xen_spin_trylock(lock);
+ if (ret) {
+ ADD_STATS(taken_slow_pickup, 1);
+
+ /*
+ * If we interrupted another spinlock while it
+ * was blocking, make sure it doesn't block
+ * without rechecking the lock.
+ */
+ if (prev != NULL)
+ xen_set_irq_pending(irq);
+ goto out;
+ }
- /* check again make sure it didn't become free while
- we weren't looking */
- ret = xen_spin_trylock(lock);
- if (ret) {
- ADD_STATS(taken_slow_pickup, 1);
- goto out;
- }
+ /*
+ * Block until irq becomes pending. If we're
+ * interrupted at this point (after the trylock but
+ * before entering the block), then the nested lock
+ * handler guarantees that the irq will be left
+ * pending if there's any chance the lock became free;
+ * xen_poll_irq() returns immediately if the irq is
+ * pending.
+ */
+ xen_poll_irq(irq);
+ kstat_this_cpu.irqs[irq]++;
+ ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
+ } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
- /* block until irq becomes pending */
- xen_poll_irq(irq);
- kstat_this_cpu.irqs[irq]++;
+ /* Leave the irq pending so that any interrupted blocker will
+ recheck. */
out:
raw_local_irq_restore(flags);
- unspinning_lock(xl);
+ unspinning_lock(xl, prev);
return ret;
}
@@ -333,6 +373,8 @@
&spinlock_stats.taken_slow_pickup);
debugfs_create_u64("taken_slow_irqenable", 0444, d_spin_debug,
&spinlock_stats.taken_slow_irqenable);
+ debugfs_create_u64("taken_slow_spurious", 0444, d_spin_debug,
+ &spinlock_stats.taken_slow_spurious);
debugfs_create_u64("released", 0444, d_spin_debug,
&spinlock_stats.released);
debugfs_create_u64("released_slow", 0444, d_spin_debug,
diff -r 5b4b80c08799 drivers/xen/events.c
--- a/drivers/xen/events.c Wed Aug 06 01:35:09 2008 -0700
+++ b/drivers/xen/events.c Wed Aug 06 10:51:27 2008 -0700
@@ -162,6 +162,12 @@
{
struct shared_info *s = HYPERVISOR_shared_info;
sync_set_bit(port, &s->evtchn_pending[0]);
+}
+
+static inline int test_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ return sync_test_bit(port, &s->evtchn_pending[0]);
}
@@ -748,6 +754,25 @@
clear_evtchn(evtchn);
}
+void xen_set_irq_pending(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ set_evtchn(evtchn);
+}
+
+bool xen_test_irq_pending(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+ bool ret = false;
+
+ if (VALID_EVTCHN(evtchn))
+ ret = test_evtchn(evtchn);
+
+ return ret;
+}
+
/* Poll waiting for an irq to become pending. In the usual case, the
irq will be disabled so it won't deliver an interrupt. */
void xen_poll_irq(int irq)
diff -r 5b4b80c08799 include/xen/events.h
--- a/include/xen/events.h Wed Aug 06 01:35:09 2008 -0700
+++ b/include/xen/events.h Wed Aug 06 10:51:27 2008 -0700
@@ -47,6 +47,8 @@
/* Clear an irq's pending state, in preparation for polling on it */
void xen_clear_irq_pending(int irq);
+void xen_set_irq_pending(int irq);
+bool xen_test_irq_pending(int irq);
/* Poll waiting for an irq to become pending. In the usual case, the
irq will be disabled so it won't deliver an interrupt. */