We started to test a Linux dom0 up to 16-way (4 sockets, dual core, with
HT) and we began to see some serious scaling issues, compared to scaling
baremetal Linux to 16-way. We took some profiles and saw that functions
in xen/arch/x86/mm.c were using disproportionately more CPU time as we
scaled up the number of CPUs. Taking a quick look at those functions
(for example, update_va_mapping and do_mmu_update) it became kind
of obvious that the locking probably does not scale. It appears we lock
domain-wide in many of these functions. This, IMO, causes a serious
problem when running an SMP domain which happens to page fault a lot.
So, I got to thinking, just how much protection do we really need in
these functions? The OS should already provide quite a bit of
protection to page table writes. Is Xen imposing an even more
deliberate and possibly unnecessary protection here?
So I made some changes to when we lock/unlock in most of these functions
in mm.c (patch attached). Warning: I am pretty much making a shot in
the dark here. I do not know this code nearly well enough to say this
is the right thing to do. However, I can say without a doubt the
changes make a significant improvement in performance:
benchmark throughput increase with lock reduction
SDET 19%
reaim_shared 65%
reaim_fserver 16%
Below are per-function ratios of CPU time rev8830/rev8830-lock-reduction
(derived from oprofile diffs)
SDET:
9.84/1 restore_all_guest
1.45/1 mod_l1_entry
2.59/1 do_softirq
1.63/1 test_guest_events
1.09/1 syscall_enter
1.35/1 propagate_page_fault
1.18/1 process_guest_except
1.13/1 timer_softirq_action
1.04/1 alloc_page_type
1.05/1 revalidate_l1
1.08/1 do_set_segment_base
1.10/1 get_s_time
1.19/1 __context_switch
1.09/1 switch_to_kernel
1.11/1 FLT4
1.62/1 xen_l3_entry_update
1.27/1 xen_invlpg_mask
reaim_shared:
1.43/1 do_update_va_mapping
1.44/1 do_page_fault
1.47/1 do_mmu_update
6.75/1 restore_all_guest
1.43/1 do_mmuext_op
1.37/1 sedf_do_schedule
1.20/1 mod_l1_entry
2.46/1 do_softirq
1.27/1 t_timer_fn
1.34/1 do_set_segment_base
1.20/1 timer_softirq_action
1.24/1 process_guest_except
1.12/1 timer_interrupt
1.14/1 evtchn_send
reaim_fserver:
1.16/1 do_update_va_mapping
1.13/1 do_page_fault
8.41/1 restore_all_guest
1.17/1 do_mmu_update
1.56/1 mod_l1_entry
2.48/1 do_softirq
1.02/1 do_mmuext_op
1.14/1 sedf_do_schedule
1.12/1 t_timer_fn
1.23/1 do_set_segment_base
1.11/1 device_not_available
1.11/1 timer_softirq_action
1.13/1 process_guest_except
1.20/1 timer_interrupt
1.15/1 copy_from_user
1.11/1 propagate_page_fault
Any comments greatly appreciated.
-Andrew
<signed-off-by: habanero@xxxxxxxxxx>
diff -Naurp xen-unstable.hg-8830/xen/arch/x86/mm.c
xen-unstable.hg-8830-lockfix/xen/arch/x86/mm.c
--- xen-unstable.hg-8830/xen/arch/x86/mm.c 2006-02-15 16:30:31.000000000
-0600
+++ xen-unstable.hg-8830-lockfix/xen/arch/x86/mm.c 2006-02-15
16:32:48.000000000 -0600
@@ -1729,6 +1729,8 @@ int do_mmuext_op(
cleanup_writable_pagetable(d);
+ UNLOCK_BIGLOCK(d);
+
if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
{
count &= ~MMU_UPDATE_PREEMPTED;
@@ -1957,7 +1959,6 @@ int do_mmuext_op(
if ( unlikely(pdone != NULL) )
__put_user(done + i, pdone);
- UNLOCK_BIGLOCK(d);
return rc;
}
@@ -1982,6 +1983,8 @@ int do_mmu_update(
cleanup_writable_pagetable(d);
+ UNLOCK_BIGLOCK(d);
+
if ( unlikely(shadow_mode_enabled(d)) )
check_pagetable(v, "pre-mmu"); /* debug */
@@ -2206,7 +2209,6 @@ int do_mmu_update(
if ( unlikely(shadow_mode_enabled(d)) )
check_pagetable(v, "post-mmu"); /* debug */
- UNLOCK_BIGLOCK(d);
return rc;
}
@@ -2503,6 +2505,8 @@ int do_update_va_mapping(unsigned long v
cleanup_writable_pagetable(d);
+ UNLOCK_BIGLOCK(d);
+
if ( unlikely(shadow_mode_enabled(d)) )
check_pagetable(v, "pre-va"); /* debug */
@@ -2574,8 +2578,6 @@ int do_update_va_mapping(unsigned long v
process_deferred_ops(cpu);
- UNLOCK_BIGLOCK(d);
-
return rc;
}
@@ -2675,13 +2677,9 @@ long do_set_gdt(unsigned long *frame_lis
if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
return -EFAULT;
- LOCK_BIGLOCK(current->domain);
-
if ( (ret = set_gdt(current, frames, entries)) == 0 )
local_flush_tlb();
- UNLOCK_BIGLOCK(current->domain);
-
return ret;
}
@@ -2700,21 +2698,18 @@ long do_update_descriptor(u64 pa, u64 de
*(u64 *)&d = desc;
- LOCK_BIGLOCK(dom);
if ( !VALID_MFN(mfn = gmfn_to_mfn(dom, gmfn)) ||
(((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
!mfn_valid(mfn) ||
!check_descriptor(&d) )
{
- UNLOCK_BIGLOCK(dom);
return -EINVAL;
}
page = mfn_to_page(mfn);
if ( unlikely(!get_page(page, dom)) )
{
- UNLOCK_BIGLOCK(dom);
return -EINVAL;
}
@@ -2760,8 +2755,6 @@ long do_update_descriptor(u64 pa, u64 de
out:
put_page(page);
- UNLOCK_BIGLOCK(dom);
-
return ret;
}
@@ -2793,7 +2786,6 @@ long arch_memory_op(int op, void *arg)
return -ESRCH;
}
- LOCK_BIGLOCK(d);
if ( d->arch.first_reserved_pfn == 0 )
{
d->arch.first_reserved_pfn = pfn = d->max_pages;
@@ -2803,7 +2795,6 @@ long arch_memory_op(int op, void *arg)
guest_physmap_add_page(
d, pfn + 1 + i, gnttab_shared_mfn(d, d->grant_table, i));
}
- UNLOCK_BIGLOCK(d);
xrpa.first_gpfn = d->arch.first_reserved_pfn;
xrpa.nr_gpfns = 32;
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|