Add "page deduplication" capability (with optional compression
and trailing-zero elimination) to Xen's tmem.
(Transparent to tmem-enabled guests.) Ephemeral pages
that have the exact same content are "combined" so that only
one page frame is needed. Since ephemeral pages are essentially
read-only, no C-O-W (and thus no equivalent of swapping) is
necessary. Deduplication can be combined with compression
or "trailing zero elimination" for even more space savings.
No non-tmem code is affected.
Signed-off-by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
Points of interest:
- Modifications to LRU eviction algorithm to accommodate
dedup'ed pages
- New data structures to allow lookup of matching pages
and track references. (Algorithm used is similar to
that used by KSM in KVM/Linux: No hashing required.)
- Lock (and rbtree) chosen by first byte of data to
allow reasonably high concurrency without greatly
complicating lock management.
- Statistics added so "dedup ratio" can be monitored.
- Dedup is disabled/enabled by Xen command line option
and must be combined with the tmem Xen option.
Trailing zero elimination ("tze") saves significant tmem
RAM when many data files are less than 1-3 pages and remaining
(unused) space at the end of a page is zero-filled on disk
or in memory, as may be the case for example when VMs are
serving a large number of (deduplicatable) web pages.
Compression already was a tmem option; v2 of this patch
combines compression with deduplication to further
improve tmem RAM utilization.
Either option (tmem_tze or tmem_compress) can be enabled
at xen boot time in addition to deduplication (tmem_dedup)
but compression overrides/disables tze. Both have a
significant CPU cost so are useful primarily when
memory is more constrained than CPU cycles, for example
on a many-core machine with many low CPU-utilization
RAM-needy VMs.
tools/misc/xen-tmem-list-parse.c | 30 ++
xen/common/tmem.c | 474 ++++++++++++++++++++++++++++++++-------
xen/common/tmem_xen.c | 33 ++
xen/include/xen/tmem_xen.h | 122 +++++++++-
4 files changed, 576 insertions(+), 83 deletions(-)
diff -r b8d2a4134a68 tools/misc/xen-tmem-list-parse.c
--- a/tools/misc/xen-tmem-list-parse.c Wed Mar 03 17:41:58 2010 +0000
+++ b/tools/misc/xen-tmem-list-parse.c Mon Apr 05 16:09:58 2010 -0600
@@ -110,13 +110,39 @@ void parse_global(char *s)
unsigned long long rtree_node_max = parse(s,"Nm");
unsigned long long pgp_count = parse(s,"Pc");
unsigned long long pgp_max = parse(s,"Pm");
+ unsigned long long page_count = parse(s,"Fc");
+ unsigned long long max_page_count = parse(s,"Fm");
+ unsigned long long pcd_count = parse(s,"Sc");
+ unsigned long long max_pcd_count = parse(s,"Sm");
+ unsigned long long pcd_tot_tze_size = parse(s,"Zt");
+ unsigned long long pcd_tot_csize = parse(s,"Gz");
+ unsigned long long deduped_puts = parse(s,"Gd");
+ unsigned long long tot_good_eph_puts = parse(s,"Ep");
printf("total tmem ops=%llu (errors=%llu) -- tmem pages avail=%llu\n",
total_ops, errored_ops, avail_pages);
printf("datastructs: objs=%llu (max=%llu) pgps=%llu (max=%llu) "
- "nodes=%llu (max=%llu)\n",
+ "nodes=%llu (max=%llu) pages=%llu (max=%llu) ",
obj_count, obj_max, pgp_count, pgp_max,
- rtree_node_count, rtree_node_max);
+ rtree_node_count, rtree_node_max,
+ page_count,max_page_count);
+ if (max_pcd_count != 0 && global_eph_count != 0 && tot_good_eph_puts != 0)
{
+ printf("pcds=%llu (max=%llu) ",
+ pcd_count,max_pcd_count);
+ printf("deduped: avg=%4.2f%% (curr=%4.2f%%) ",
+ ((deduped_puts*1.0)/tot_good_eph_puts)*100,
+ (1.0-(pcd_count*1.0)/global_eph_count)*100);
+ }
+ if (pcd_count != 0)
+ {
+ if (pcd_tot_tze_size && (pcd_tot_tze_size < pcd_count*PAGE_SIZE))
+ printf("tze savings=%4.2f%% ",
+ (1.0-(pcd_tot_tze_size*1.0)/(pcd_count*PAGE_SIZE))*100);
+ if (pcd_tot_csize && (pcd_tot_csize < pcd_count*PAGE_SIZE))
+ printf("compression savings=%4.2f%% ",
+ (1.0-(pcd_tot_csize*1.0)/(pcd_count*PAGE_SIZE))*100);
+ }
+ printf("\n");
printf("misc: failed_copies=%llu alloc_failed=%llu alloc_page_failed=%llu "
"low_mem=%llu evicted=%llu/%llu relinq=%llu/%llu, "
"max_evicts_per_relinq=%llu, flush_pools=%llu, "
diff -r b8d2a4134a68 xen/common/tmem.c
--- a/xen/common/tmem.c Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/common/tmem.c Mon Apr 05 16:09:58 2010 -0600
@@ -6,11 +6,10 @@
* Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
*/
-/* TODO list: 090129
- - improve on reclamation policy
+/* TODO list: 090129 (updated 100318)
+ - any better reclamation policy?
- use different tlsf pools for each client (maybe each pool)
- - implement page accounting and minimal QoS limits
- - test shared access more completely (need pv cluster fs)
+ - test shared access more completely (ocfs2)
- add feedback-driven compression (not for persistent pools though!)
- add data-structure total bytes overhead stats
*/
@@ -77,12 +76,17 @@ static unsigned long relinq_pgs = 0, rel
static unsigned long relinq_pgs = 0, relinq_attempts = 0;
static unsigned long max_evicts_per_relinq = 0;
static unsigned long low_on_memory = 0;
+static unsigned long deduped_puts = 0;
+static unsigned long tot_good_eph_puts = 0;
static int global_obj_count_max = 0;
static int global_pgp_count_max = 0;
+static int global_pcd_count_max = 0;
static int global_page_count_max = 0;
static int global_rtree_node_count_max = 0;
static long global_eph_count_max = 0;
static unsigned long failed_copies;
+static unsigned long pcd_tot_tze_size = 0;
+static unsigned long pcd_tot_csize = 0;
DECL_CYC_COUNTER(succ_get);
DECL_CYC_COUNTER(succ_put);
@@ -108,6 +112,7 @@ DECL_CYC_COUNTER(decompress);
struct tm_pool;
struct tmem_page_descriptor;
+struct tmem_page_content_descriptor;
struct client {
struct list_head client_list;
struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
@@ -219,12 +224,17 @@ struct tmem_page_descriptor {
obj_t *obj;
uint64_t inv_oid; /* used for invalid list only */
};
+ pagesize_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
+ else compressed data (cdata) */
uint32_t index;
- size_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
- else compressed data (cdata) */
+ /* must hold pcd_tree_rwlocks[firstbyte] to use pcd pointer/siblings */
+    uint16_t firstbyte; /* NOT_SHAREABLE->pfp otherwise->pcd */
+ bool_t eviction_attempted; /* CHANGE TO lifetimes? (settable) */
+ struct list_head pcd_siblings;
union {
pfp_t *pfp; /* page frame pointer */
char *cdata; /* compressed data */
+ struct tmem_page_content_descriptor *pcd; /* page dedup */
};
union {
uint64_t timestamp;
@@ -233,6 +243,25 @@ struct tmem_page_descriptor {
DECL_SENTINEL
};
typedef struct tmem_page_descriptor pgp_t;
+
+#define PCD_TZE_MAX_SIZE (PAGE_SIZE - (PAGE_SIZE/64))
+
+struct tmem_page_content_descriptor {
+ union {
+ pfp_t *pfp; /* page frame pointer */
+ char *cdata; /* if compression_enabled */
+ char *tze; /* if !compression_enabled, trailing zeroes eliminated */
+ };
+ struct list_head pgp_list;
+ struct rb_node pcd_rb_tree_node;
+ uint32_t pgp_ref_count;
+ pagesize_t size; /* if compression_enabled -> 0<size<PAGE_SIZE (*cdata)
+ * else if tze, 0<=size<PAGE_SIZE, rounded up to mult of 8
+ * else PAGE_SIZE -> *pfp */
+};
+typedef struct tmem_page_content_descriptor pcd_t;
+struct rb_root pcd_tree_roots[256]; /* choose based on first byte of page */
+rwlock_t pcd_tree_rwlocks[256]; /* poor man's concurrency for now */
static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools
*/
@@ -267,6 +296,7 @@ static long global_eph_count = 0; /* ato
static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
static atomic_t global_obj_count = ATOMIC_INIT(0);
static atomic_t global_pgp_count = ATOMIC_INIT(0);
+static atomic_t global_pcd_count = ATOMIC_INIT(0);
static atomic_t global_page_count = ATOMIC_INIT(0);
static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
@@ -336,6 +366,229 @@ static NOINLINE void tmem_page_free(pool
atomic_dec_and_assert(global_page_count);
}
+/************ PAGE CONTENT DESCRIPTOR MANIPULATION ROUTINES ***********/
+
+#define NOT_SHAREABLE ((uint16_t)-1UL)
+
+static NOINLINE int pcd_copy_to_client(tmem_cli_mfn_t cmfn, pgp_t *pgp)
+{
+ uint8_t firstbyte = pgp->firstbyte;
+ pcd_t *pcd;
+ int ret;
+
+ ASSERT(tmh_dedup_enabled());
+ tmem_read_lock(&pcd_tree_rwlocks[firstbyte]);
+ pcd = pgp->pcd;
+ if ( pgp->size < PAGE_SIZE && pgp->size != 0 &&
+ pcd->size < PAGE_SIZE && pcd->size != 0 )
+ ret = tmh_decompress_to_client(cmfn, pcd->cdata, pcd->size, NULL);
+ else if ( tmh_tze_enabled() && pcd->size < PAGE_SIZE )
+ ret = tmh_copy_tze_to_client(cmfn, pcd->tze, pcd->size);
+ else
+ ret = tmh_copy_to_client(cmfn, pcd->pfp, 0, 0, PAGE_SIZE, NULL);
+ tmem_read_unlock(&pcd_tree_rwlocks[firstbyte]);
+ return ret;
+}
+
+/* ensure pgp no longer points to pcd, nor vice-versa */
+/* take pcd rwlock unless have_pcd_rwlock is set, always unlock when done */
+static NOINLINE void pcd_disassociate(pgp_t *pgp, pool_t *pool, bool_t
have_pcd_rwlock)
+{
+ pcd_t *pcd = pgp->pcd;
+ pfp_t *pfp = pgp->pcd->pfp;
+ uint16_t firstbyte = pgp->firstbyte;
+ char *pcd_tze = pgp->pcd->tze;
+ pagesize_t pcd_size = pcd->size;
+ pagesize_t pgp_size = pgp->size;
+ char *pcd_cdata = pgp->pcd->cdata;
+ pagesize_t pcd_csize = pgp->pcd->size;
+
+ ASSERT(tmh_dedup_enabled());
+ ASSERT(firstbyte != NOT_SHAREABLE);
+ ASSERT(firstbyte < 256);
+
+ if ( have_pcd_rwlock )
+ ASSERT_WRITELOCK(&pcd_tree_rwlocks[firstbyte]);
+ else
+ tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
+ list_del_init(&pgp->pcd_siblings);
+ pgp->pcd = NULL;
+ pgp->firstbyte = NOT_SHAREABLE;
+ pgp->size = -1;
+ if ( --pcd->pgp_ref_count )
+ {
+ tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+ return;
+ }
+
+ /* no more references to this pcd, recycle it and the physical page */
+ ASSERT(list_empty(&pcd->pgp_list));
+ pcd->pfp = NULL;
+ /* remove pcd from rbtree */
+ rb_erase(&pcd->pcd_rb_tree_node,&pcd_tree_roots[firstbyte]);
+ /* reinit the struct for safety for now */
+ RB_CLEAR_NODE(&pcd->pcd_rb_tree_node);
+ /* now free up the pcd memory */
+ tmem_free(pcd,sizeof(pcd_t),NULL);
+ atomic_dec_and_assert(global_pcd_count);
+ if ( pgp_size != 0 && pcd_size < PAGE_SIZE )
+ {
+ /* compressed data */
+ tmem_free(pcd_cdata,pcd_csize,pool);
+ pcd_tot_csize -= pcd_csize;
+ }
+ else if ( pcd_size != PAGE_SIZE )
+ {
+ /* trailing zero data */
+ pcd_tot_tze_size -= pcd_size;
+ if ( pcd_size )
+ tmem_free(pcd_tze,pcd_size,pool);
+ } else {
+ /* real physical page */
+ if ( tmh_tze_enabled() )
+ pcd_tot_tze_size -= PAGE_SIZE;
+ if ( tmh_compression_enabled() )
+ pcd_tot_csize -= PAGE_SIZE;
+ tmem_page_free(pool,pfp);
+ }
+ tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+}
+
+
+static NOINLINE int pcd_associate(pgp_t *pgp, char *cdata, pagesize_t csize)
+{
+ struct rb_node **new, *parent = NULL;
+ struct rb_root *root;
+ pcd_t *pcd;
+ int cmp;
+ pagesize_t pfp_size = 0;
+ uint8_t firstbyte = (cdata == NULL) ? tmh_get_first_byte(pgp->pfp) :
*cdata;
+ int ret = 0;
+
+ if ( !tmh_dedup_enabled() )
+ return 0;
+ ASSERT(pgp->obj != NULL);
+ ASSERT(pgp->obj->pool != NULL);
+ ASSERT(!pgp->obj->pool->persistent);
+ if ( cdata == NULL )
+ {
+ ASSERT(pgp->pfp != NULL);
+ pfp_size = PAGE_SIZE;
+ if ( tmh_tze_enabled() )
+ {
+ pfp_size = tmh_tze_pfp_scan(pgp->pfp);
+ if ( pfp_size > PCD_TZE_MAX_SIZE )
+ pfp_size = PAGE_SIZE;
+ }
+ ASSERT(pfp_size <= PAGE_SIZE);
+ ASSERT(!(pfp_size & (sizeof(uint64_t)-1)));
+ }
+ tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
+
+ /* look for page match */
+ root = &pcd_tree_roots[firstbyte];
+ new = &(root->rb_node);
+ while ( *new )
+ {
+ pcd = container_of(*new, pcd_t, pcd_rb_tree_node);
+ parent = *new;
+ /* compare new entry and rbtree entry, set cmp accordingly */
+ if ( cdata != NULL )
+ {
+ if ( pcd->size < PAGE_SIZE )
+ /* both new entry and rbtree entry are compressed */
+ cmp = tmh_pcd_cmp(cdata,csize,pcd->cdata,pcd->size);
+ else
+ /* new entry is compressed, rbtree entry is not */
+ cmp = -1;
+ } else if ( pcd->size < PAGE_SIZE )
+            /* rbtree entry is compressed, new entry is not */
+ cmp = 1;
+ else if ( tmh_tze_enabled() ) {
+ if ( pcd->size < PAGE_SIZE )
+ /* both new entry and rbtree entry are trailing zero */
+ cmp = tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->tze,pcd->size);
+ else
+ /* new entry is trailing zero, rbtree entry is not */
+ cmp = tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->pfp,PAGE_SIZE);
+ } else {
+ /* both new entry and rbtree entry are full physical pages */
+ ASSERT(pgp->pfp != NULL);
+ ASSERT(pcd->pfp != NULL);
+ cmp = tmh_page_cmp(pgp->pfp,pcd->pfp);
+ }
+
+ /* walk tree or match depending on cmp */
+ if ( cmp < 0 )
+ new = &((*new)->rb_left);
+ else if ( cmp > 0 )
+ new = &((*new)->rb_right);
+ else
+ {
+ /* match! if not compressed, free the no-longer-needed page */
+ /* but if compressed, data is assumed static so don't free! */
+ if ( cdata == NULL )
+ tmem_page_free(pgp->obj->pool,pgp->pfp);
+ deduped_puts++;
+ goto match;
+ }
+ }
+
+ /* exited while loop with no match, so alloc a pcd and put it in the tree
*/
+ if ( (pcd = tmem_malloc(pcd_t, NULL)) == NULL )
+ {
+ ret = -ENOMEM;
+ goto unlock;
+ } else if ( cdata != NULL ) {
+ if ( (pcd->cdata = tmem_malloc_bytes(csize,pgp->obj->pool)) == NULL )
+ {
+ tmem_free(pcd,sizeof(pcd_t),NULL);
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ }
+ atomic_inc_and_max(global_pcd_count);
+    RB_CLEAR_NODE(&pcd->pcd_rb_tree_node); /* is this necessary? */
+    INIT_LIST_HEAD(&pcd->pgp_list); /* is this necessary? */
+ pcd->pgp_ref_count = 0;
+ if ( cdata != NULL )
+ {
+ memcpy(pcd->cdata,cdata,csize);
+ pcd->size = csize;
+ pcd_tot_csize += csize;
+ } else if ( pfp_size == 0 ) {
+ ASSERT(tmh_tze_enabled());
+ pcd->size = 0;
+ pcd->tze = NULL;
+ } else if ( pfp_size < PAGE_SIZE &&
+ ((pcd->tze = tmem_malloc_bytes(pfp_size,pgp->obj->pool)) != NULL) ) {
+ tmh_tze_copy_from_pfp(pcd->tze,pgp->pfp,pfp_size);
+ pcd->size = pfp_size;
+ pcd_tot_tze_size += pfp_size;
+ tmem_page_free(pgp->obj->pool,pgp->pfp);
+ } else {
+ pcd->pfp = pgp->pfp;
+ pcd->size = PAGE_SIZE;
+ if ( tmh_tze_enabled() )
+ pcd_tot_tze_size += PAGE_SIZE;
+ if ( tmh_compression_enabled() )
+ pcd_tot_csize += PAGE_SIZE;
+ }
+ rb_link_node(&pcd->pcd_rb_tree_node, parent, new);
+ rb_insert_color(&pcd->pcd_rb_tree_node, root);
+
+match:
+ pcd->pgp_ref_count++;
+ list_add(&pgp->pcd_siblings,&pcd->pgp_list);
+ pgp->firstbyte = firstbyte;
+ pgp->eviction_attempted = 0;
+ pgp->pcd = pcd;
+
+unlock:
+ tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+ return ret;
+}
+
/************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
/* allocate a pgp_t and associate it with an object */
@@ -353,6 +606,12 @@ static NOINLINE pgp_t *pgp_alloc(obj_t *
INIT_LIST_HEAD(&pgp->global_eph_pages);
INIT_LIST_HEAD(&pgp->client_eph_pages);
pgp->pfp = NULL;
+ if ( tmh_dedup_enabled() )
+ {
+ pgp->firstbyte = NOT_SHAREABLE;
+ pgp->eviction_attempted = 0;
+ INIT_LIST_HEAD(&pgp->pcd_siblings);
+ }
pgp->size = -1;
pgp->index = -1;
pgp->timestamp = get_cycles();
@@ -374,18 +633,20 @@ static pgp_t *pgp_lookup_in_obj(obj_t *o
static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool)
{
+ pagesize_t pgp_size = pgp->size;
+
if ( pgp->pfp == NULL )
return;
- if ( !pgp->size )
+ if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
+ pcd_disassociate(pgp,pool,0); /* pgp->size lost */
+ else if ( pgp_size )
+ tmem_free(pgp->cdata,pgp_size,pool);
+ else
tmem_page_free(pgp->obj->pool,pgp->pfp);
- else
+ if ( pool != NULL && pgp_size )
{
- tmem_free(pgp->cdata,pgp->size,pool);
- if ( pool != NULL )
- {
- pool->client->compressed_pages--;
- pool->client->compressed_sum_size -= pgp->size;
- }
+ pool->client->compressed_pages--;
+ pool->client->compressed_sum_size -= pgp_size;
}
pgp->pfp = NULL;
pgp->size = -1;
@@ -987,10 +1248,56 @@ static void client_freeze(client_t *clie
/************ MEMORY REVOCATION ROUTINES *******************************/
+static bool_t tmem_try_to_evict_pgp(pgp_t *pgp, bool_t *hold_pool_rwlock)
+{
+ obj_t *obj = pgp->obj;
+ pool_t *pool = obj->pool;
+ client_t *client = pool->client;
+ uint16_t firstbyte = pgp->firstbyte;
+
+ if ( pool->is_dying )
+ return 0;
+ if ( tmh_lock_all && !obj->no_evict )
+ return 1;
+ if ( tmem_spin_trylock(&obj->obj_spinlock) )
+ {
+ if ( tmh_dedup_enabled() )
+ {
+ firstbyte = pgp->firstbyte;
+ if ( firstbyte == NOT_SHAREABLE )
+ goto obj_unlock;
+ ASSERT(firstbyte < 256);
+ if ( !tmem_write_trylock(&pcd_tree_rwlocks[firstbyte]) )
+ goto obj_unlock;
+ if ( pgp->pcd->pgp_ref_count > 1 && !pgp->eviction_attempted )
+ {
+ pgp->eviction_attempted++;
+ list_del(&pgp->global_eph_pages);
+
list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
+ list_del(&pgp->client_eph_pages);
+
list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
+ goto pcd_unlock;
+ }
+ }
+ if ( obj->pgp_count > 1 )
+ return 1;
+ if ( tmem_write_trylock(&pool->pool_rwlock) )
+ {
+ *hold_pool_rwlock = 1;
+ return 1;
+ }
+pcd_unlock:
+ tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+obj_unlock:
+ tmem_spin_unlock(&obj->obj_spinlock);
+ }
+ return 0;
+}
+
static int tmem_evict(void)
{
client_t *client = tmh_client_from_current();
- pgp_t *pgp = NULL, *pgp_del;
+ pgp_t *pgp = NULL, *pgp2, *pgp_del;
obj_t *obj;
pool_t *pool;
int ret = 0;
@@ -1001,49 +1308,15 @@ static int tmem_evict(void)
if ( (client != NULL) && client_over_quota(client) &&
!list_empty(&client->ephemeral_page_list) )
{
- list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
- {
- obj = pgp->obj;
- pool = obj->pool;
- if ( pool->is_dying )
- continue;
- if ( tmh_lock_all && !obj->no_evict )
+
list_for_each_entry_safe(pgp,pgp2,&client->ephemeral_page_list,client_eph_pages)
+ if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
goto found;
- if ( tmem_spin_trylock(&obj->obj_spinlock) )
- {
- if ( obj->pgp_count > 1 )
- goto found;
- if ( tmem_write_trylock(&pool->pool_rwlock) )
- {
- hold_pool_rwlock = 1;
- goto found;
- }
- tmem_spin_unlock(&obj->obj_spinlock);
- }
- }
} else if ( list_empty(&global_ephemeral_page_list) ) {
goto out;
} else {
- list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
- {
- obj = pgp->obj;
- pool = obj->pool;
- if ( pool->is_dying )
- continue;
- if ( tmh_lock_all && !obj->no_evict )
+
list_for_each_entry_safe(pgp,pgp2,&global_ephemeral_page_list,global_eph_pages)
+ if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
goto found;
- if ( tmem_spin_trylock(&obj->obj_spinlock) )
- {
- if ( obj->pgp_count > 1 )
- goto found;
- if ( tmem_write_trylock(&pool->pool_rwlock) )
- {
- hold_pool_rwlock = 1;
- goto found;
- }
- tmem_spin_unlock(&obj->obj_spinlock);
- }
- }
}
ret = 0;
@@ -1057,10 +1330,16 @@ found:
ASSERT(obj->no_evict == 0);
ASSERT(obj->pool != NULL);
ASSERT_SENTINEL(obj,OBJ);
+ pool = obj->pool;
ASSERT_SPINLOCK(&obj->obj_spinlock);
pgp_del = pgp_delete_from_obj(obj, pgp->index);
ASSERT(pgp_del == pgp);
+ if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
+ {
+ ASSERT(pgp->pcd->pgp_ref_count == 1 || pgp->eviction_attempted);
+ pcd_disassociate(pgp,pool,1);
+ }
pgp_delete(pgp,1);
if ( obj->pgp_count == 0 )
{
@@ -1129,25 +1408,30 @@ static NOINLINE int do_tmem_put_compress
#ifdef __i386__
return -ENOMEM;
#endif
+
if ( pgp->pfp != NULL )
- pgp_free_data(pgp, pgp->obj->pool); /* FIXME... is this right? */
+ pgp_free_data(pgp, pgp->obj->pool);
START_CYC_COUNTER(compress);
ret = tmh_compress_from_client(cmfn, &dst, &size, cva);
if ( (ret == -EFAULT) || (ret == 0) )
goto out;
- else if ( (size == 0) || (size >= tmem_subpage_maxsize()) )
+ else if ( (size == 0) || (size >= tmem_subpage_maxsize()) ) {
ret = 0;
- else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL )
+ goto out;
+ } else if ( tmh_dedup_enabled() && !is_persistent(pgp->obj->pool) ) {
+ if ( (ret = pcd_associate(pgp,dst,size)) == -ENOMEM )
+ goto out;
+ } else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL ) {
ret = -ENOMEM;
- else
- {
+ goto out;
+ } else {
memcpy(p,dst,size);
pgp->cdata = p;
- pgp->size = size;
- pgp->obj->pool->client->compressed_pages++;
- pgp->obj->pool->client->compressed_sum_size += size;
- ret = 1;
}
+ pgp->size = size;
+ pgp->obj->pool->client->compressed_pages++;
+ pgp->obj->pool->client->compressed_sum_size += size;
+ ret = 1;
out:
END_CYC_COUNTER(compress);
@@ -1155,7 +1439,7 @@ out:
}
static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
- uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva)
+ pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void
*cva)
{
pool_t *pool;
obj_t *obj;
@@ -1197,6 +1481,11 @@ copy_uncompressed:
ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,0);
if ( ret == -EFAULT )
goto bad_copy;
+ if ( tmh_dedup_enabled() && !is_persistent(pool) )
+ {
+ if ( pcd_associate(pgp,NULL,0) == -ENOMEM )
+ goto failed_dup;
+ }
pgp->size = 0;
done:
@@ -1239,8 +1528,8 @@ failed_dup:
static NOINLINE int do_tmem_put(pool_t *pool,
uint64_t oid, uint32_t index,
- tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
- uint32_t pfn_offset, uint32_t len, void *cva)
+ tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
+ pagesize_t pfn_offset, pagesize_t len, void *cva)
{
obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
pgp_t *pgp = NULL, *pgpdel = NULL;
@@ -1308,13 +1597,18 @@ copy_uncompressed:
copy_uncompressed:
if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
{
- ret == -ENOMEM;
+ ret = -ENOMEM;
goto delete_and_free;
}
/* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,cva);
if ( ret == -EFAULT )
goto bad_copy;
+ if ( tmh_dedup_enabled() && !is_persistent(pool) )
+ {
+ if ( pcd_associate(pgp,NULL,0) == -ENOMEM )
+ goto delete_and_free;
+ }
pgp->size = 0;
insert_page:
@@ -1344,6 +1638,8 @@ insert_page:
pool->good_puts++;
if ( is_persistent(pool) )
client->succ_pers_puts++;
+ else
+ tot_good_eph_puts++;
return 1;
delete_and_free:
@@ -1376,8 +1672,8 @@ ASSERT(0);
}
static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index,
- tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
- uint32_t pfn_offset, uint32_t len, void *cva)
+ tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
+ pagesize_t pfn_offset, pagesize_t len, void *cva)
{
obj_t *obj;
pgp_t *pgp;
@@ -1404,15 +1700,18 @@ static NOINLINE int do_tmem_get(pool_t *
return 0;
}
ASSERT(pgp->size != -1);
- if ( pgp->size != 0 )
+ if ( tmh_dedup_enabled() && !is_persistent(pool) &&
+ pgp->firstbyte != NOT_SHAREABLE )
{
+ if ( pcd_copy_to_client(cmfn, pgp) == -EFAULT )
+ goto bad_copy;
+ } else if ( pgp->size != 0 ) {
START_CYC_COUNTER(decompress);
if ( tmh_decompress_to_client(cmfn, pgp->cdata,
pgp->size, cva) == -EFAULT )
goto bad_copy;
END_CYC_COUNTER(decompress);
- }
- else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
+ } else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
pfn_offset, len, cva) == -EFAULT)
goto bad_copy;
if ( is_ephemeral(pool) )
@@ -1855,11 +2154,15 @@ static int tmemc_list_global(tmem_cli_va
total_flush_pool, use_long ? ',' : '\n');
if (use_long)
n += scnprintf(info+n,BSIZE-n,
- "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
+ "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d,"
+ "Fc:%d,Fm:%d,Sc:%d,Sm:%d,Ep:%lu,Gd:%lu,Zt:%lu,Gz:%lu\n",
global_eph_count, global_eph_count_max,
_atomic_read(global_obj_count), global_obj_count_max,
_atomic_read(global_rtree_node_count), global_rtree_node_count_max,
- _atomic_read(global_pgp_count), global_pgp_count_max);
+ _atomic_read(global_pgp_count), global_pgp_count_max,
+ _atomic_read(global_page_count), global_page_count_max,
+ _atomic_read(global_pcd_count), global_pcd_count_max,
+ tot_good_eph_puts,deduped_puts,pcd_tot_tze_size,pcd_tot_csize);
if ( sum + n >= len )
return sum;
tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
@@ -1912,6 +2215,13 @@ static int tmemc_set_var_one(client_t *c
#ifdef __i386__
return -1;
#endif
+ if ( tmh_dedup_enabled() )
+ {
+ printk("tmem: compression %s for all %ss, cannot be changed "
+ "when tmem_dedup is enabled\n",
+ tmh_compression_enabled() ? "enabled" : "disabled",client_str);
+ return -1;
+ }
client->compress = arg1 ? 1 : 0;
printk("tmem: compression %s for %s=%d\n",
arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
@@ -2569,14 +2879,28 @@ EXPORT void *tmem_relinquish_pages(unsig
/* called at hypervisor startup */
EXPORT void init_tmem(void)
{
+ int i;
if ( !tmh_enabled() )
return;
radix_tree_init();
+ if ( tmh_dedup_enabled() )
+ for (i = 0; i < 256; i++ )
+ {
+ pcd_tree_roots[i] = RB_ROOT;
+ rwlock_init(&pcd_tree_rwlocks[i]);
+ }
+
if ( tmh_init() )
{
- printk("tmem: initialized comp=%d global-lock=%d\n",
- tmh_compression_enabled(), tmh_lock_all);
+ printk("tmem: initialized comp=%d dedup=%d tze=%d global-lock=%d\n",
+ tmh_compression_enabled(), tmh_dedup_enabled(), tmh_tze_enabled(),
+ tmh_lock_all);
+ if ( tmh_dedup_enabled()&&tmh_compression_enabled()&&tmh_tze_enabled()
)
+ {
+ tmh_tze_disable();
+ printk("tmem: tze and compression not compatible, disabling
tze\n");
+ }
tmem_initialized = 1;
}
else
diff -r b8d2a4134a68 xen/common/tmem_xen.c
--- a/xen/common/tmem_xen.c Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/common/tmem_xen.c Mon Apr 05 16:09:58 2010 -0600
@@ -19,6 +19,12 @@ boolean_param("tmem", opt_tmem);
EXPORT int opt_tmem_compress = 0;
boolean_param("tmem_compress", opt_tmem_compress);
+
+EXPORT int opt_tmem_dedup = 0;
+boolean_param("tmem_dedup", opt_tmem_dedup);
+
+EXPORT int opt_tmem_tze = 0;
+boolean_param("tmem_tze", opt_tmem_tze);
EXPORT int opt_tmem_shared_auth = 0;
boolean_param("tmem_shared_auth", opt_tmem_shared_auth);
@@ -103,8 +109,8 @@ static inline void *cli_mfn_to_va(tmem_c
#endif
EXPORT int tmh_copy_from_client(pfp_t *pfp,
- tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
- uint32_t pfn_offset, uint32_t len, void *cli_va)
+ tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
+ pagesize_t pfn_offset, pagesize_t len, void *cli_va)
{
unsigned long tmem_mfn;
void *tmem_va;
@@ -148,7 +154,7 @@ EXPORT int tmh_compress_from_client(tmem
}
EXPORT int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
- uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cli_va)
+ pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void
*cli_va)
{
unsigned long tmem_mfn, cli_mfn = 0;
int mark_dirty = 1;
@@ -195,6 +201,27 @@ EXPORT int tmh_decompress_to_client(tmem
unmap_domain_page(cli_va);
paging_mark_dirty(current->domain,cli_mfn);
}
+ mb();
+ return 1;
+}
+
+EXPORT int tmh_copy_tze_to_client(tmem_cli_mfn_t cmfn, void *tmem_va,
+ pagesize_t len)
+{
+ void *cli_va;
+ unsigned long cli_mfn;
+
+ ASSERT(!(len & (sizeof(uint64_t)-1)));
+ ASSERT(len <= PAGE_SIZE);
+ ASSERT(len > 0 || tmem_va == NULL);
+ if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL)
+ return -EFAULT;
+ if ( len > 0 )
+ memcpy((char *)cli_va,(char *)tmem_va,len);
+ if ( len < PAGE_SIZE )
+ memset((char *)cli_va+len,0,PAGE_SIZE-len);
+ unmap_domain_page(cli_va);
+ paging_mark_dirty(current->domain,cli_mfn);
mb();
return 1;
}
diff -r b8d2a4134a68 xen/include/xen/tmem_xen.h
--- a/xen/include/xen/tmem_xen.h Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/include/xen/tmem_xen.h Mon Apr 05 16:09:58 2010 -0600
@@ -26,6 +26,8 @@ struct tmem_host_dependent_client {
};
typedef struct tmem_host_dependent_client tmh_client_t;
+typedef uint32_t pagesize_t; /* like size_t, must handle largest PAGE_SIZE */
+
#define IS_PAGE_ALIGNED(addr) \
((void *)((((unsigned long)addr + (PAGE_SIZE - 1)) & PAGE_MASK)) == addr)
#define IS_VALID_PAGE(_pi) ( mfn_valid(page_to_mfn(_pi)) )
@@ -52,6 +54,23 @@ static inline int tmh_compression_enable
static inline int tmh_compression_enabled(void)
{
return opt_tmem_compress;
+}
+
+extern int opt_tmem_dedup;
+static inline int tmh_dedup_enabled(void)
+{
+ return opt_tmem_dedup;
+}
+
+extern int opt_tmem_tze;
+static inline int tmh_tze_enabled(void)
+{
+ return opt_tmem_tze;
+}
+
+static inline void tmh_tze_disable(void)
+{
+ opt_tmem_tze = 0;
}
extern int opt_tmem_shared_auth;
@@ -326,6 +345,101 @@ static inline bool_t tmh_current_is_priv
return IS_PRIV(current->domain);
}
+static inline uint8_t tmh_get_first_byte(pfp_t *pfp)
+{
+ void *p = __map_domain_page(pfp);
+
+ return (uint8_t)(*(char *)p);
+}
+
+static inline int tmh_page_cmp(pfp_t *pfp1, pfp_t *pfp2)
+{
+ const uint64_t *p1 = (uint64_t *)__map_domain_page(pfp1);
+ const uint64_t *p2 = (uint64_t *)__map_domain_page(pfp2);
+ int i;
+
+ // FIXME: code in assembly?
+ASSERT(p1 != NULL);
+ASSERT(p2 != NULL);
+ for ( i = PAGE_SIZE/sizeof(uint64_t); i && *p1 == *p2; i--, *p1++, *p2++ );
+ if ( !i )
+ return 0;
+ if ( *p1 < *p2 )
+ return -1;
+ return 1;
+}
+
+static inline int tmh_pcd_cmp(void *va1, pagesize_t len1, void *va2,
pagesize_t len2)
+{
+ const char *p1 = (char *)va1;
+ const char *p2 = (char *)va2;
+ pagesize_t i;
+
+ ASSERT(len1 <= PAGE_SIZE);
+ ASSERT(len2 <= PAGE_SIZE);
+ if ( len1 < len2 )
+ return -1;
+ if ( len1 > len2 )
+ return 1;
+ ASSERT(len1 == len2);
+ for ( i = len2; i && *p1 == *p2; i--, *p1++, *p2++ );
+ if ( !i )
+ return 0;
+ if ( *p1 < *p2 )
+ return -1;
+ return 1;
+}
+
+static inline int tmh_tze_pfp_cmp(pfp_t *pfp1, pagesize_t pfp_len, void *tva,
pagesize_t tze_len)
+{
+ const uint64_t *p1 = (uint64_t *)__map_domain_page(pfp1);
+ const uint64_t *p2;
+ pagesize_t i;
+
+ if ( tze_len == PAGE_SIZE )
+ p2 = (uint64_t *)__map_domain_page((pfp_t *)tva);
+ else
+ p2 = (uint64_t *)tva;
+ ASSERT(pfp_len <= PAGE_SIZE);
+ ASSERT(!(pfp_len & (sizeof(uint64_t)-1)));
+ ASSERT(tze_len <= PAGE_SIZE);
+ ASSERT(!(tze_len & (sizeof(uint64_t)-1)));
+ if ( pfp_len < tze_len )
+ return -1;
+ if ( pfp_len > tze_len )
+ return 1;
+ ASSERT(pfp_len == tze_len);
+ for ( i = tze_len/sizeof(uint64_t); i && *p1 == *p2; i--, *p1++, *p2++ );
+ if ( !i )
+ return 0;
+ if ( *p1 < *p2 )
+ return -1;
+ return 1;
+}
+
+/* return the size of the data in the pfp, ignoring trailing zeroes and
+ * rounded up to the nearest multiple of 8 */
+static inline pagesize_t tmh_tze_pfp_scan(pfp_t *pfp)
+{
+ const uint64_t *p = (uint64_t *)__map_domain_page(pfp);
+ pagesize_t bytecount = PAGE_SIZE;
+ pagesize_t len = PAGE_SIZE/sizeof(uint64_t);
+ p += len;
+ while ( len-- && !*--p )
+ bytecount -= sizeof(uint64_t);
+ return bytecount;
+}
+
+static inline void tmh_tze_copy_from_pfp(void *tva, pfp_t *pfp, pagesize_t len)
+{
+ uint64_t *p1 = (uint64_t *)tva;
+ const uint64_t *p2 = (uint64_t *)__map_domain_page(pfp);
+
+ pagesize_t i;
+ ASSERT(!(len & (sizeof(uint64_t)-1)));
+ for ( i = len/sizeof(uint64_t); i--; *p1++ = *p2++);
+}
+
/* these typedefs are in the public/tmem.h interface
typedef XEN_GUEST_HANDLE(void) cli_mfn_t;
typedef XEN_GUEST_HANDLE(char) cli_va_t;
@@ -378,11 +492,13 @@ extern int tmh_compress_from_client(tmem
extern int tmh_compress_from_client(tmem_cli_mfn_t,void**,size_t *,void*);
extern int tmh_copy_from_client(pfp_t *pfp,
- tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
- uint32_t pfn_offset, uint32_t len, void *cva);
+ tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
+ pagesize_t pfn_offset, pagesize_t len, void *cva);
extern int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
- uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva);
+ pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cva);
+
+extern int tmh_copy_tze_to_client(tmem_cli_mfn_t cmfn, void *tmem_va,
pagesize_t len);
#define TMEM_PERF
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|