# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1264019615 0
# Node ID fad80160c0012182006eb80ae55532ab09630db0
# Parent b0b41e735575ec6fc1d7d56f3cffa9246058881a
xentrace: Per-cpu xentrace buffers
In the current xentrace configuration, xentrace buffers are all
allocated in a single contiguous chunk, which is then divided among
logical cpus, one buffer per cpu. The size of an allocatable chunk is
fairly limited, in my experience about 128 pages (512KiB). As the
number of logical cpus increases, this means an ever smaller maximum
trace buffer per cpu; on my dual-socket quad-core Nehalem box with
hyperthreading (16 logical cpus), that comes to just 8 pages per
logical cpu.

This patch addresses the issue by allocating each cpu's buffer
separately, so that a cpu's buffer size is limited only by the maximum
size of a single allocation, rather than by that maximum divided among
all cpus.
Signed-off-by: George Dunlap <dunlapg@xxxxxxxxx>
---
tools/xentrace/xentrace.c | 137 +++++++++++++++++---------------------
xen/common/trace.c | 158 ++++++++++++++++++++++++++++++++++----------
xen/include/public/sysctl.h | 2
xen/include/public/trace.h | 10 ++
4 files changed, 197 insertions(+), 110 deletions(-)
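[Reviewer note, not part of the patch: the sketch below condenses the
consumer-side flow introduced in map_tbufs() further down. It redeclares
struct t_info locally so the example is self-contained, and assumes the
t_info area has already been mapped from the MFN returned by
XEN_SYSCTL_TBUFOP_get_info; xc_map_foreign_batch() and DOMID_XEN are used
just as in the patch. Error handling is omitted.]

    #include <stdint.h>
    #include <sys/mman.h>   /* PROT_READ, PROT_WRITE */
    #include <xenctrl.h>    /* xc_map_foreign_batch(), xen_pfn_t, DOMID_XEN */

    /* Local copy of the public structure added to trace.h by this patch. */
    struct t_info {
        uint16_t tbuf_size;     /* pages per trace buffer */
        uint16_t mfn_offset[];  /* per-cpu offset into t_info, in uint32_t units */
    };

    /*
     * Map one cpu's trace buffer, given the already-mapped t_info area.
     * The struct t_buf metadata sits at the start of the returned mapping
     * and the trace records follow immediately after it.
     */
    static void *map_cpu_buffer(int xc_handle, struct t_info *t_info,
                                unsigned int cpu)
    {
        uint32_t *mfn_list = (uint32_t *)t_info + t_info->mfn_offset[cpu];
        xen_pfn_t pfn_list[t_info->tbuf_size];
        unsigned int j;

        /* The MFN list is 32-bit; widen each entry to xen_pfn_t for mapping. */
        for ( j = 0; j < t_info->tbuf_size; j++ )
            pfn_list[j] = (xen_pfn_t)mfn_list[j];

        return xc_map_foreign_batch(xc_handle, DOMID_XEN,
                                    PROT_READ | PROT_WRITE,
                                    pfn_list, t_info->tbuf_size);
    }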
diff -r b0b41e735575 -r fad80160c001 tools/xentrace/xentrace.c
--- a/tools/xentrace/xentrace.c Wed Jan 20 09:51:38 2010 +0000
+++ b/tools/xentrace/xentrace.c Wed Jan 20 20:33:35 2010 +0000
@@ -60,6 +60,12 @@ typedef struct settings_st {
uint8_t discard:1,
disable_tracing:1;
} settings_t;
+
+struct t_struct {
+ struct t_info *t_info; /* Structure with information about individual buffers */
+ struct t_buf **meta; /* Pointers to trace buffer metadata */
+ unsigned char **data; /* Pointers to trace buffer data areas */
+};
settings_t opts;
@@ -446,22 +452,61 @@ static void get_tbufs(unsigned long *mfn
*
* Maps the Xen trace buffers into the process address space.
*/
-static struct t_buf *map_tbufs(unsigned long tbufs_mfn, unsigned int num,
- unsigned long size)
-{
- struct t_buf *tbufs_mapped;
-
- tbufs_mapped = xc_map_foreign_range(xc_handle, DOMID_XEN,
- size * num, PROT_READ | PROT_WRITE,
+static struct t_struct *map_tbufs(unsigned long tbufs_mfn, unsigned int num,
+ unsigned long tinfo_size)
+{
+ static struct t_struct tbufs = { 0 };
+ int i;
+
+ /* Map t_info metadata structure */
+ tbufs.t_info = xc_map_foreign_range(xc_handle, DOMID_XEN,
+ tinfo_size, PROT_READ | PROT_WRITE,
tbufs_mfn);
- if ( tbufs_mapped == 0 )
+ if ( tbufs.t_info == 0 )
{
PERROR("Failed to mmap trace buffers");
exit(EXIT_FAILURE);
}
- return tbufs_mapped;
+ if ( tbufs.t_info->tbuf_size == 0 )
+ {
+ fprintf(stderr, "%s: tbuf_size 0!\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ /* Map per-cpu buffers */
+ tbufs.meta = (struct t_buf **)calloc(num, sizeof(struct t_buf *));
+ tbufs.data = (unsigned char **)calloc(num, sizeof(unsigned char *));
+ if ( tbufs.meta == NULL || tbufs.data == NULL )
+ {
+ PERROR( "Failed to allocate memory for buffer pointers\n");
+ exit(EXIT_FAILURE);
+ }
+
+ for(i=0; i<num; i++)
+ {
+
+ uint32_t *mfn_list = ((uint32_t *)tbufs.t_info) + tbufs.t_info->mfn_offset[i];
+ int j;
+ xen_pfn_t pfn_list[tbufs.t_info->tbuf_size];
+
+ for ( j=0; j<tbufs.t_info->tbuf_size; j++)
+ pfn_list[j] = (xen_pfn_t)mfn_list[j];
+
+ tbufs.meta[i] = xc_map_foreign_batch(xc_handle, DOMID_XEN,
+ PROT_READ | PROT_WRITE,
+ pfn_list,
+ tbufs.t_info->tbuf_size);
+ if ( tbufs.meta[i] == NULL )
+ {
+ PERROR("Failed to map cpu buffer!");
+ exit(EXIT_FAILURE);
+ }
+ tbufs.data[i] = (unsigned char *)(tbufs.meta[i]+1);
+ }
+
+ return &tbufs;
}
/**
@@ -490,66 +535,6 @@ static void set_mask(uint32_t mask, int
}
/**
- * init_bufs_ptrs - initialises an array of pointers to the trace buffers
- * @bufs_mapped: the userspace address where the trace buffers are mapped
- * @num: number of trace buffers
- * @size: trace buffer size
- *
- * Initialises an array of pointers to individual trace buffers within the
- * mapped region containing all trace buffers.
- */
-static struct t_buf **init_bufs_ptrs(void *bufs_mapped, unsigned int num,
- unsigned long size)
-{
- int i;
- struct t_buf **user_ptrs;
-
- user_ptrs = (struct t_buf **)calloc(num, sizeof(struct t_buf *));
- if ( user_ptrs == NULL )
- {
- PERROR( "Failed to allocate memory for buffer pointers\n");
- exit(EXIT_FAILURE);
- }
-
- /* initialise pointers to the trace buffers - given the size of a trace
- * buffer and the value of bufs_maped, we can easily calculate these */
- for ( i = 0; i<num; i++ )
- user_ptrs[i] = (struct t_buf *)((unsigned long)bufs_mapped + size * i);
-
- return user_ptrs;
-}
-
-
-/**
- * init_rec_ptrs - initialises data area pointers to locations in user space
- * @tbufs_mfn: base mfn of the trace buffer area
- * @tbufs_mapped: user virtual address of base of trace buffer area
- * @meta: array of user-space pointers to struct t_buf's of metadata
- * @num: number of trace buffers
- *
- * Initialises data area pointers to the locations that data areas have been
- * mapped in user space. Note that the trace buffer metadata contains machine
- * pointers - the array returned allows more convenient access to them.
- */
-static unsigned char **init_rec_ptrs(struct t_buf **meta, unsigned int num)
-{
- int i;
- unsigned char **data;
-
- data = calloc(num, sizeof(unsigned char *));
- if ( data == NULL )
- {
- PERROR("Failed to allocate memory for data pointers\n");
- exit(EXIT_FAILURE);
- }
-
- for ( i = 0; i < num; i++ )
- data[i] = (unsigned char *)(meta[i] + 1);
-
- return data;
-}
-
-/**
* get_num_cpus - get the number of logical CPUs
*/
static unsigned int get_num_cpus(void)
@@ -638,12 +623,13 @@ static int monitor_tbufs(void)
{
int i;
- void *tbufs_mapped; /* pointer to where the tbufs are mapped */
+ struct t_struct *tbufs; /* Pointer to hypervisor maps */
struct t_buf **meta; /* pointers to the trace buffer metadata */
unsigned char **data; /* pointers to the trace buffer data areas
* where they are mapped into user space. */
unsigned long tbufs_mfn; /* mfn of the tbufs */
unsigned int num; /* number of trace buffers / logical CPUS */
+ unsigned long tinfo_size; /* size of t_info metadata map */
unsigned long size; /* size of a single trace buffer */
unsigned long data_size;
@@ -655,14 +641,15 @@ static int monitor_tbufs(void)
num = get_num_cpus();
/* setup access to trace buffers */
- get_tbufs(&tbufs_mfn, &size);
- tbufs_mapped = map_tbufs(tbufs_mfn, num, size);
+ get_tbufs(&tbufs_mfn, &tinfo_size);
+ tbufs = map_tbufs(tbufs_mfn, num, tinfo_size);
+
+ size = tbufs->t_info->tbuf_size * PAGE_SIZE;
data_size = size - sizeof(struct t_buf);
- /* build arrays of convenience ptrs */
- meta = init_bufs_ptrs(tbufs_mapped, num, size);
- data = init_rec_ptrs(meta, num);
+ meta = tbufs->meta;
+ data = tbufs->data;
if ( opts.discard )
for ( i = 0; i < num; i++ )
diff -r b0b41e735575 -r fad80160c001 xen/common/trace.c
--- a/xen/common/trace.c Wed Jan 20 09:51:38 2010 +0000
+++ b/xen/common/trace.c Wed Jan 20 20:33:35 2010 +0000
@@ -46,8 +46,11 @@ integer_param("tbuf_size", opt_tbuf_size
integer_param("tbuf_size", opt_tbuf_size);
/* Pointers to the meta-data objects for all system trace buffers */
+static struct t_info *t_info;
+#define T_INFO_PAGES 2 /* Size fixed at 2 pages for now. */
static DEFINE_PER_CPU_READ_MOSTLY(struct t_buf *, t_bufs);
static DEFINE_PER_CPU_READ_MOSTLY(unsigned char *, t_data);
+static DEFINE_PER_CPU_READ_MOSTLY(spinlock_t, t_lock);
static int data_size;
/* High water mark for trace buffers; */
@@ -80,41 +83,104 @@ static u32 tb_event_mask = TRC_ALL;
*/
static int alloc_trace_bufs(void)
{
- int i, order;
+ int i, cpu, order;
unsigned long nr_pages;
- char *rawbuf;
- struct t_buf *buf;
+ /* Start after a fixed-size array of NR_CPUS */
+ uint32_t *t_info_mfn_list = (uint32_t *)t_info;
+ int offset = (NR_CPUS * 2 + 1 + 1) / 4;
if ( opt_tbuf_size == 0 )
return -EINVAL;
- nr_pages = num_online_cpus() * opt_tbuf_size;
- order = get_order_from_pages(nr_pages);
+ if ( !t_info )
+ {
+ printk("%s: t_info not allocated, cannot allocate trace buffers!\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ t_info->tbuf_size = opt_tbuf_size;
+ printk("tbuf_size %d\n", t_info->tbuf_size);
+
+ nr_pages = opt_tbuf_size;
+ order = get_order_from_pages(nr_pages);
+
+ /*
+ * First, allocate buffers for all of the cpus. If any
+ * fails, deallocate what you have so far and exit.
+ */
+ for_each_online_cpu(cpu)
+ {
+ int flags;
+ char *rawbuf;
+ struct t_buf *buf;
+
+ if ( (rawbuf = alloc_xenheap_pages(order, 0)) == NULL )
+ {
+ printk("Xen trace buffers: memory allocation failed\n");
+ opt_tbuf_size = 0;
+ goto out_dealloc;
+ }
+
+ spin_lock_irqsave(&per_cpu(t_lock, cpu), flags);
+
+ buf = per_cpu(t_bufs, cpu) = (struct t_buf *)rawbuf;
+ buf->cons = buf->prod = 0;
+ per_cpu(t_data, cpu) = (unsigned char *)(buf + 1);
+
+ spin_unlock_irqrestore(&per_cpu(t_lock, cpu), flags);
+
+ }
+
+ /*
+ * Now share the pages so that xentrace can map them, and record them in
+ * the global t_info structure.
+ */
+ for_each_online_cpu(cpu)
+ {
+ /* Share pages so that xentrace can map them. */
+ char *rawbuf;
+
+ if ( (rawbuf = (char *)per_cpu(t_bufs, cpu)) )
+ {
+ struct page_info *p = virt_to_page(rawbuf);
+ uint32_t mfn = virt_to_mfn(rawbuf);
+
+ for ( i = 0; i < nr_pages; i++ )
+ {
+ share_xen_page_with_privileged_guests(
+ p + i, XENSHARE_writable);
+
+ t_info_mfn_list[offset + i]=mfn + i;
+ }
+ /* Write list first, then write per-cpu offset. */
+ wmb();
+ t_info->mfn_offset[cpu]=offset;
+ printk("p%d mfn %"PRIx32" offset %d\n",
+ cpu, mfn, offset);
+ offset+=i;
+ }
+ }
+
data_size = (opt_tbuf_size * PAGE_SIZE - sizeof(struct t_buf));
-
- if ( (rawbuf = alloc_xenheap_pages(order, 0)) == NULL )
- {
- printk("Xen trace buffers: memory allocation failed\n");
- opt_tbuf_size = 0;
- return -EINVAL;
- }
-
- /* Share pages so that xentrace can map them. */
- for ( i = 0; i < nr_pages; i++ )
- share_xen_page_with_privileged_guests(
- virt_to_page(rawbuf) + i, XENSHARE_writable);
-
- for_each_online_cpu ( i )
- {
- buf = per_cpu(t_bufs, i) = (struct t_buf *)
- &rawbuf[i*opt_tbuf_size*PAGE_SIZE];
- buf->cons = buf->prod = 0;
- per_cpu(t_data, i) = (unsigned char *)(buf + 1);
- }
-
t_buf_highwater = data_size >> 1; /* 50% high water */
return 0;
+out_dealloc:
+ for_each_online_cpu(cpu)
+ {
+ int flags;
+ char * rawbuf;
+
+ spin_lock_irqsave(&per_cpu(t_lock, cpu), flags);
+ if ( (rawbuf = (char *)per_cpu(t_bufs, cpu)) )
+ {
+ ASSERT(!(virt_to_page(rawbuf)->count_info & PGC_allocated));
+ free_xenheap_pages(rawbuf, order);
+ }
+ spin_unlock_irqrestore(&per_cpu(t_lock, cpu), flags);
+ }
+ return -EINVAL;
}
@@ -181,6 +247,26 @@ int trace_will_trace_event(u32 event)
*/
void __init init_trace_bufs(void)
{
+ int i;
+ /* t_info size fixed at 2 pages for now. That should be big enough / small enough
+ * until it's worth making it dynamic. */
+ t_info = alloc_xenheap_pages(1, 0);
+
+ if ( t_info == NULL )
+ {
+ printk("Xen trace buffers: t_info allocation failed! Tracing
disabled.\n");
+ return;
+ }
+
+ for(i = 0; i < NR_CPUS; i++)
+ spin_lock_init(&per_cpu(t_lock, i));
+
+ for(i=0; i<T_INFO_PAGES; i++)
+ share_xen_page_with_privileged_guests(
+ virt_to_page(t_info) + i, XENSHARE_writable);
+
+
+
if ( opt_tbuf_size == 0 )
{
printk("Xen trace buffers: disabled\n");
@@ -210,8 +296,8 @@ int tb_control(xen_sysctl_tbuf_op_t *tbc
{
case XEN_SYSCTL_TBUFOP_get_info:
tbc->evt_mask = tb_event_mask;
- tbc->buffer_mfn = opt_tbuf_size ? virt_to_mfn(per_cpu(t_bufs, 0)) : 0;
- tbc->size = opt_tbuf_size * PAGE_SIZE;
+ tbc->buffer_mfn = t_info ? virt_to_mfn(t_info) : 0;
+ tbc->size = T_INFO_PAGES;
break;
case XEN_SYSCTL_TBUFOP_set_cpu_mask:
xenctl_cpumap_to_cpumask(&tb_cpu_mask, &tbc->cpu_mask);
@@ -220,7 +306,7 @@ int tb_control(xen_sysctl_tbuf_op_t *tbc
tb_event_mask = tbc->evt_mask;
break;
case XEN_SYSCTL_TBUFOP_set_size:
- rc = !tb_init_done ? tb_set_size(tbc->size) : -EINVAL;
+ rc = tb_set_size(tbc->size);
break;
case XEN_SYSCTL_TBUFOP_enable:
/* Enable trace buffers. Check buffers are already allocated. */
@@ -428,7 +514,7 @@ void __trace_var(u32 event, int cycles,
unsigned long flags, bytes_to_tail, bytes_to_wrap;
int rec_size, total_size;
int extra_word;
- int started_below_highwater;
+ int started_below_highwater = 0;
if( !tb_init_done )
return;
@@ -462,9 +548,12 @@ void __trace_var(u32 event, int cycles,
/* Read tb_init_done /before/ t_bufs. */
rmb();
+ spin_lock_irqsave(&this_cpu(t_lock), flags);
+
buf = this_cpu(t_bufs);
- local_irq_save(flags);
+ if ( unlikely(!buf) )
+ goto unlock;
started_below_highwater = (calc_unconsumed_bytes(buf) < t_buf_highwater);
@@ -511,8 +600,8 @@ void __trace_var(u32 event, int cycles,
{
if ( ++this_cpu(lost_records) == 1 )
this_cpu(lost_records_first_tsc)=(u64)get_cycles();
- local_irq_restore(flags);
- return;
+ started_below_highwater = 0;
+ goto unlock;
}
/*
@@ -541,7 +630,8 @@ void __trace_var(u32 event, int cycles,
/* Write the original record */
__insert_record(buf, event, extra, cycles, rec_size, extra_data);
- local_irq_restore(flags);
+unlock:
+ spin_unlock_irqrestore(&this_cpu(t_lock), flags);
/* Notify trace buffer consumer that we've crossed the high water mark. */
if ( started_below_highwater &&
diff -r b0b41e735575 -r fad80160c001 xen/include/public/sysctl.h
--- a/xen/include/public/sysctl.h Wed Jan 20 09:51:38 2010 +0000
+++ b/xen/include/public/sysctl.h Wed Jan 20 20:33:35 2010 +0000
@@ -75,7 +75,7 @@ struct xen_sysctl_tbuf_op {
uint32_t evt_mask;
/* OUT variables */
uint64_aligned_t buffer_mfn;
- uint32_t size;
+ uint32_t size; /* Also an IN variable! */
};
typedef struct xen_sysctl_tbuf_op xen_sysctl_tbuf_op_t;
DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tbuf_op_t);
diff -r b0b41e735575 -r fad80160c001 xen/include/public/trace.h
--- a/xen/include/public/trace.h Wed Jan 20 09:51:38 2010 +0000
+++ b/xen/include/public/trace.h Wed Jan 20 20:33:35 2010 +0000
@@ -195,6 +195,16 @@ struct t_buf {
/* Records follow immediately after the meta-data header. */
};
+/* Structure used to pass MFNs to the trace buffers back to trace consumers.
+ * Offset is an offset into the mapped structure where the mfn list will be held.
+ * MFNs will be at ((uint32_t *)(t_info))+(t_info->mfn_offset[cpu]).
+ */
+struct t_info {
+ uint16_t tbuf_size; /* Size in pages of each trace buffer */
+ uint16_t mfn_offset[]; /* Offset within t_info structure of the page list per cpu */
+ /* MFN lists immediately after the header */
+};
+
#endif /* __XEN_PUBLIC_TRACE_H__ */
/*