(This is for post-4.0, but I'm posting now for feedback.)
Add "page deduplication" capability to Xen-side of tmem.
(Transparent to tmem-enabled guests.) Ephemeral pages
that have the exact same content are "combined" so that only
one page frame is needed. Since ephemeral pages are essentially
read-only, no C-O-W (and thus no equivalent of swapping) is
necessary.
Does anybody know of a good, fast (SSE2?) assembly memory-compare
routine? (See tmh_page_cmp() in the patch.)
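To make the question concrete, here is a minimal, untested sketch of
the sort of routine I mean, written with SSE2 intrinsics rather than
hand assembly and ignoring for now whether SSE state may safely be
touched in the hypervisor; the name sse2_page_cmp is made up. It keeps
memcmp-style ordering so the result could still place nodes in the
rbtree:

    #include <stdint.h>
    #include <string.h>
    #include <emmintrin.h>

    /* compare two page-aligned 4K buffers, returning <0/0/>0 like memcmp */
    static int sse2_page_cmp(const void *a, const void *b)
    {
        const uint8_t *p1 = a, *p2 = b;
        size_t i;

        for ( i = 0; i < 4096; i += 16 )
        {
            __m128i v1 = _mm_load_si128((const __m128i *)(p1 + i));
            __m128i v2 = _mm_load_si128((const __m128i *)(p2 + i));
            /* movemask is 0xffff only if all 16 bytes matched */
            if ( _mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) != 0xffff )
                return memcmp(p1 + i, p2 + i, 16);
        }
        return 0;
    }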
Points of interest:
- Modifications to the LRU eviction algorithm to accommodate
dedup'ed pages
- New data structures to allow lookup of matching pages
and to track references. (The algorithm is similar to
the one used by KSM in KVM/Linux: no hashing required.)
- Lock (and rbtree) chosen by first byte of data to
allow reasonably high concurrency without greatly
complicating lock management.
- Statistics added so "dedup ratio" can be monitored.
- Dedup is enabled/disabled by a Xen command line option (example below).
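For example, since the patch adds a boolean_param ("tmem_dedup") and
defaults dedup to on, turning it off at boot would look something like
the following (the xen.gz path and other options here are purely
illustrative, and this assumes the usual Xen boolean option syntax):

    kernel /boot/xen.gz ... tmem tmem_dedup=0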
I'm seeing a dedup ratio of 1.08-1.52 for two self-ballooned guests
simultaneously and continuously building Linux; since a ratio of 1.52
means only about 1/1.52 = 66% of the frames are needed, that's up to a
34% reduction in physical ephemeral pages used by tmem. Clearly
this is very workload-dependent. YMMV. To obtain these savings,
approximately double the time is spent in tmem (increasing from
roughly 0.1% to roughly 0.2%). This compares favorably to
compression, which costs approximately 10x for a savings of
approximately 50%.
Signed-off-by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
diff -r b8d2a4134a68 tools/misc/xen-tmem-list-parse.c
--- a/tools/misc/xen-tmem-list-parse.c Wed Mar 03 17:41:58 2010 +0000
+++ b/tools/misc/xen-tmem-list-parse.c Tue Mar 09 14:58:37 2010 -0700
@@ -110,13 +110,20 @@ void parse_global(char *s)
unsigned long long rtree_node_max = parse(s,"Nm");
unsigned long long pgp_count = parse(s,"Pc");
unsigned long long pgp_max = parse(s,"Pm");
+ unsigned long long page_count = parse(s,"Fc");
+ unsigned long long max_page_count = parse(s,"Fm");
+ unsigned long long pcd_count = parse(s,"Sc");
+ unsigned long long max_pcd_count = parse(s,"Sm");
printf("total tmem ops=%llu (errors=%llu) -- tmem pages avail=%llu\n",
total_ops, errored_ops, avail_pages);
printf("datastructs: objs=%llu (max=%llu) pgps=%llu (max=%llu) "
- "nodes=%llu (max=%llu)\n",
+ "nodes=%llu (max=%llu) pages=%llu (max=%llu) pcds=%llu (max=%llu) "
+ "dedup ratio=%f\n",
obj_count, obj_max, pgp_count, pgp_max,
- rtree_node_count, rtree_node_max);
+ rtree_node_count, rtree_node_max,
+ page_count,max_page_count,pcd_count,max_pcd_count,
+ (pcd_count==0)?0.0:(global_eph_count*1.0)/pcd_count);
printf("misc: failed_copies=%llu alloc_failed=%llu alloc_page_failed=%llu "
"low_mem=%llu evicted=%llu/%llu relinq=%llu/%llu, "
"max_evicts_per_relinq=%llu, flush_pools=%llu, "
diff -r b8d2a4134a68 xen/common/tmem.c
--- a/xen/common/tmem.c Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/common/tmem.c Tue Mar 09 14:58:37 2010 -0700
@@ -79,6 +79,7 @@ static unsigned long low_on_memory = 0;
static unsigned long low_on_memory = 0;
static int global_obj_count_max = 0;
static int global_pgp_count_max = 0;
+static int global_pcd_count_max = 0;
static int global_page_count_max = 0;
static int global_rtree_node_count_max = 0;
static long global_eph_count_max = 0;
@@ -108,6 +109,7 @@ DECL_CYC_COUNTER(decompress);
struct tm_pool;
struct tmem_page_descriptor;
+struct tmem_page_content_descriptor;
struct client {
struct list_head client_list;
struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
@@ -219,12 +221,17 @@ struct tmem_page_descriptor {
obj_t *obj;
uint64_t inv_oid; /* used for invalid list only */
};
- uint32_t index;
size_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
else compressed data (cdata) */
+ uint32_t index;
+ /* must hold pcd_tree_rwlocks[firstbyte] to use pcd pointer/siblings */
+ uint16_t firstbyte; /* NOT_SHAREABLE -> use pfp, otherwise -> use pcd */
+ bool_t eviction_attempted; /* CHANGE TO lifetimes? (settable) */
+ struct list_head pcd_siblings;
union {
pfp_t *pfp; /* page frame pointer */
char *cdata; /* compressed data */
+ struct tmem_page_content_descriptor *pcd; /* page dedup */
};
union {
uint64_t timestamp;
@@ -233,6 +240,16 @@ struct tmem_page_descriptor {
DECL_SENTINEL
};
typedef struct tmem_page_descriptor pgp_t;
+
+struct tmem_page_content_descriptor {
+ pfp_t *pfp;
+ struct list_head pgp_list;
+ struct rb_node pcd_rb_tree_node;
+ uint32_t count;
+};
+typedef struct tmem_page_content_descriptor pcd_t;
+struct rb_root pcd_tree_roots[256]; /* choose based on first byte of page */
+rwlock_t pcd_tree_rwlocks[256]; /* poor man's concurrency for now */
static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
@@ -267,6 +284,7 @@ static long global_eph_count = 0; /* ato
static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
static atomic_t global_obj_count = ATOMIC_INIT(0);
static atomic_t global_pgp_count = ATOMIC_INIT(0);
+static atomic_t global_pcd_count = ATOMIC_INIT(0);
static atomic_t global_page_count = ATOMIC_INIT(0);
static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
@@ -336,6 +354,103 @@ static NOINLINE void tmem_page_free(pool
atomic_dec_and_assert(global_page_count);
}
+/************ PAGE CONTENT DESCRIPTOR MANIPULATION ROUTINES ***********/
+
+#define NOT_SHAREABLE ((uint16_t)-1UL)
+
+/* ensure pgp no longer points to pcd, nor vice-versa */
+/* take pcd rwlock unless have_pcd_rwlock is set, always unlock when done */
+static NOINLINE void pcd_disassociate(pgp_t *pgp, pool_t *pool, bool_t have_pcd_rwlock)
+{
+ pcd_t *pcd = pgp->pcd;
+ pfp_t *pfp = pgp->pcd->pfp;
+ uint16_t firstbyte = pgp->firstbyte;
+
+ ASSERT(tmh_dedup_enabled());
+ ASSERT(firstbyte != NOT_SHAREABLE);
+ ASSERT(firstbyte < 256);
+ ASSERT(!pgp->size);
+
+ if ( have_pcd_rwlock )
+ ASSERT_WRITELOCK(&pcd_tree_rwlocks[firstbyte]);
+ else
+ tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
+ list_del_init(&pgp->pcd_siblings);
+ pgp->pcd = NULL;
+ pgp->firstbyte = NOT_SHAREABLE;
+ pgp->size = -1;
+ if ( --pcd->count )
+ {
+ tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+ return;
+ }
+
+ /* no more references to this pcd, recycle it and the physical page */
+ ASSERT(list_empty(&pcd->pgp_list));
+ pcd->pfp = NULL;
+ /* remove pcd from rbtree */
+ rb_erase(&pcd->pcd_rb_tree_node,&pcd_tree_roots[firstbyte]);
+ /* reinit the struct for safety for now */
+ RB_CLEAR_NODE(&pcd->pcd_rb_tree_node);
+ /* now free up the pcd memory */
+ tmem_free(pcd,sizeof(pcd_t),NULL);
+ atomic_dec_and_assert(global_pcd_count);
+ tmem_page_free(pool,pfp);
+ tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+}
+
+static NOINLINE int pcd_associate(pgp_t *pgp, uint8_t firstbyte)
+{
+ struct rb_node **new, *parent = NULL;
+ struct rb_root *root;
+ pcd_t *pcd;
+ int cmp;
+
+ if ( !tmh_dedup_enabled() )
+ return 0;
+ ASSERT(pgp->pfp != NULL);
+ ASSERT(pgp->obj != NULL);
+ ASSERT(pgp->obj->pool != NULL);
+ ASSERT(!pgp->obj->pool->persistent);
+ tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
+ /* look for page match */
+ root = &pcd_tree_roots[firstbyte];
+ new = &(root->rb_node);
+ while ( *new )
+ {
+ pcd = container_of(*new, pcd_t, pcd_rb_tree_node);
+ parent = *new;
+ if ( (cmp = tmh_page_cmp(pgp->pfp,pcd->pfp)) < 0 )
+ new = &((*new)->rb_left);
+ else if ( cmp > 0 )
+ new = &((*new)->rb_right);
+ else
+ {
+ /* match! free the no-longer-needed page frame from the pgp */
+ tmem_page_free(pgp->obj->pool,pgp->pfp);
+ goto match;
+ }
+ }
+ /* no match, alloc a pcd and put it in the tree */
+ if ( (pcd = tmem_malloc(pcd_t, NULL)) == NULL )
+ {
+ /* drop the tree rwlock taken above before returning failure */
+ tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+ return -ENOMEM;
+ }
+ atomic_inc_and_max(global_pcd_count);
+ RB_CLEAR_NODE(&pcd->pcd_rb_tree_node); /* is this necessary? */
+ INIT_LIST_HEAD(&pcd->pgp_list); /* is this necessary? */
+ pcd->count = 0;
+ pcd->pfp = pgp->pfp;
+ rb_link_node(&pcd->pcd_rb_tree_node, parent, new);
+ rb_insert_color(&pcd->pcd_rb_tree_node, root);
+match:
+ pcd->count++;
+ list_add(&pgp->pcd_siblings,&pcd->pgp_list);
+ pgp->firstbyte = firstbyte;
+ pgp->eviction_attempted = 0;
+ pgp->pcd = pcd;
+ tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+ return 0;
+}
+
/************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
/* allocate a pgp_t and associate it with an object */
@@ -353,6 +468,12 @@ static NOINLINE pgp_t *pgp_alloc(obj_t *
INIT_LIST_HEAD(&pgp->global_eph_pages);
INIT_LIST_HEAD(&pgp->client_eph_pages);
pgp->pfp = NULL;
+ if ( tmh_dedup_enabled() )
+ {
+ pgp->firstbyte = NOT_SHAREABLE;
+ pgp->eviction_attempted = 0;
+ INIT_LIST_HEAD(&pgp->pcd_siblings);
+ }
pgp->size = -1;
pgp->index = -1;
pgp->timestamp = get_cycles();
@@ -376,7 +497,9 @@ static NOINLINE void pgp_free_data(pgp_t
{
if ( pgp->pfp == NULL )
return;
- if ( !pgp->size )
+ if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
+ pcd_disassociate(pgp,pool,0);
+ else if ( !pgp->size )
tmem_page_free(pgp->obj->pool,pgp->pfp);
else
{
@@ -987,10 +1110,56 @@ static void client_freeze(client_t *clie
/************ MEMORY REVOCATION ROUTINES *******************************/
+static bool_t tmem_try_to_evict_pgp(pgp_t *pgp, bool_t *hold_pool_rwlock)
+{
+ obj_t *obj = pgp->obj;
+ pool_t *pool = obj->pool;
+ client_t *client = pool->client;
+ uint16_t firstbyte = pgp->firstbyte;
+
+ if ( pool->is_dying )
+ return 0;
+ if ( tmh_lock_all && !obj->no_evict )
+ return 1;
+ if ( tmem_spin_trylock(&obj->obj_spinlock) )
+ {
+ if ( tmh_dedup_enabled() )
+ {
+ firstbyte = pgp->firstbyte;
+ if ( firstbyte == NOT_SHAREABLE )
+ goto obj_unlock;
+ ASSERT(firstbyte < 256);
+ if ( !tmem_write_trylock(&pcd_tree_rwlocks[firstbyte]) )
+ goto obj_unlock;
+ if ( pgp->pcd->count > 1 && !pgp->eviction_attempted )
+ {
+ pgp->eviction_attempted++;
+ list_del(&pgp->global_eph_pages);
+ list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
+ list_del(&pgp->client_eph_pages);
+ list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
+ goto pcd_unlock;
+ }
+ }
+ if ( obj->pgp_count > 1 )
+ return 1;
+ if ( tmem_write_trylock(&pool->pool_rwlock) )
+ {
+ *hold_pool_rwlock = 1;
+ return 1;
+ }
+pcd_unlock:
+ tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+obj_unlock:
+ tmem_spin_unlock(&obj->obj_spinlock);
+ }
+ return 0;
+}
+
static int tmem_evict(void)
{
client_t *client = tmh_client_from_current();
- pgp_t *pgp = NULL, *pgp_del;
+ pgp_t *pgp = NULL, *pgp2, *pgp_del;
obj_t *obj;
pool_t *pool;
int ret = 0;
@@ -1001,49 +1170,15 @@ static int tmem_evict(void)
if ( (client != NULL) && client_over_quota(client) &&
!list_empty(&client->ephemeral_page_list) )
{
- list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
- {
- obj = pgp->obj;
- pool = obj->pool;
- if ( pool->is_dying )
- continue;
- if ( tmh_lock_all && !obj->no_evict )
+ list_for_each_entry_safe(pgp,pgp2,&client->ephemeral_page_list,client_eph_pages)
+ if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
goto found;
- if ( tmem_spin_trylock(&obj->obj_spinlock) )
- {
- if ( obj->pgp_count > 1 )
- goto found;
- if ( tmem_write_trylock(&pool->pool_rwlock) )
- {
- hold_pool_rwlock = 1;
- goto found;
- }
- tmem_spin_unlock(&obj->obj_spinlock);
- }
- }
} else if ( list_empty(&global_ephemeral_page_list) ) {
goto out;
} else {
- list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
- {
- obj = pgp->obj;
- pool = obj->pool;
- if ( pool->is_dying )
- continue;
- if ( tmh_lock_all && !obj->no_evict )
+ list_for_each_entry_safe(pgp,pgp2,&global_ephemeral_page_list,global_eph_pages)
+ if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
goto found;
- if ( tmem_spin_trylock(&obj->obj_spinlock) )
- {
- if ( obj->pgp_count > 1 )
- goto found;
- if ( tmem_write_trylock(&pool->pool_rwlock) )
- {
- hold_pool_rwlock = 1;
- goto found;
- }
- tmem_spin_unlock(&obj->obj_spinlock);
- }
- }
}
ret = 0;
@@ -1057,10 +1192,16 @@ found:
ASSERT(obj->no_evict == 0);
ASSERT(obj->pool != NULL);
ASSERT_SENTINEL(obj,OBJ);
+ pool = obj->pool;
ASSERT_SPINLOCK(&obj->obj_spinlock);
pgp_del = pgp_delete_from_obj(obj, pgp->index);
ASSERT(pgp_del == pgp);
+ if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
+ {
+ ASSERT(pgp->pcd->count == 1 || pgp->eviction_attempted);
+ pcd_disassociate(pgp,pool,1);
+ }
pgp_delete(pgp,1);
if ( obj->pgp_count == 0 )
{
@@ -1197,6 +1338,11 @@ copy_uncompressed:
ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,0);
if ( ret == -EFAULT )
goto bad_copy;
+ if ( tmh_dedup_enabled() && !is_persistent(pool) )
+ {
+ if ( pcd_associate(pgp,tmh_get_first_byte(pgp->pfp)) == -ENOMEM )
+ goto failed_dup;
+ }
pgp->size = 0;
done:
@@ -1308,13 +1454,16 @@ copy_uncompressed:
copy_uncompressed:
if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
{
- ret == -ENOMEM;
+ ret = -ENOMEM;
goto delete_and_free;
}
/* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,cva);
if ( ret == -EFAULT )
goto bad_copy;
+ if ( tmh_dedup_enabled() && !is_persistent(pool) )
+ if ( pcd_associate(pgp,tmh_get_first_byte(pgp->pfp)) == -ENOMEM )
+ goto delete_and_free;
pgp->size = 0;
insert_page:
@@ -1411,6 +1560,19 @@ static NOINLINE int do_tmem_get(pool_t *
pgp->size, cva) == -EFAULT )
goto bad_copy;
END_CYC_COUNTER(decompress);
+ }
+ else if ( tmh_dedup_enabled() && !is_persistent(pool) &&
+ pgp->firstbyte != NOT_SHAREABLE )
+ {
+ uint16_t firstbyte = pgp->firstbyte;
+ int ret;
+
+ tmem_read_lock(&pcd_tree_rwlocks[firstbyte]);
+ ret = tmh_copy_to_client(cmfn, pgp->pcd->pfp, tmem_offset,
+ pfn_offset, len, cva);
+ tmem_read_unlock(&pcd_tree_rwlocks[firstbyte]);
+ if ( ret == -EFAULT )
+ goto bad_copy;
}
else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
pfn_offset, len, cva) == -EFAULT)
@@ -1855,11 +2017,14 @@ static int tmemc_list_global(tmem_cli_va
total_flush_pool, use_long ? ',' : '\n');
if (use_long)
n += scnprintf(info+n,BSIZE-n,
- "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
+ "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d,"
+ "Fc:%d,Fm:%d,Sc:%d,Sm:%d\n",
global_eph_count, global_eph_count_max,
_atomic_read(global_obj_count), global_obj_count_max,
_atomic_read(global_rtree_node_count), global_rtree_node_count_max,
- _atomic_read(global_pgp_count), global_pgp_count_max);
+ _atomic_read(global_pgp_count), global_pgp_count_max,
+ _atomic_read(global_page_count), global_page_count_max,
+ _atomic_read(global_pcd_count), global_pcd_count_max);
if ( sum + n >= len )
return sum;
tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
@@ -2569,10 +2734,18 @@ EXPORT void *tmem_relinquish_pages(unsig
/* called at hypervisor startup */
EXPORT void init_tmem(void)
{
+ int i;
if ( !tmh_enabled() )
return;
radix_tree_init();
+ if ( tmh_dedup_enabled() )
+ for ( i = 0; i < 256; i++ )
+ {
+ pcd_tree_roots[i] = RB_ROOT;
+ rwlock_init(&pcd_tree_rwlocks[i]);
+ }
+
if ( tmh_init() )
{
printk("tmem: initialized comp=%d global-lock=%d\n",
diff -r b8d2a4134a68 xen/common/tmem_xen.c
--- a/xen/common/tmem_xen.c Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/common/tmem_xen.c Tue Mar 09 14:58:37 2010 -0700
@@ -19,6 +19,9 @@ boolean_param("tmem", opt_tmem);
EXPORT int opt_tmem_compress = 0;
boolean_param("tmem_compress", opt_tmem_compress);
+
+EXPORT int opt_tmem_dedup = 1;
+boolean_param("tmem_dedup", opt_tmem_dedup);
EXPORT int opt_tmem_shared_auth = 0;
boolean_param("tmem_shared_auth", opt_tmem_shared_auth);
diff -r b8d2a4134a68 xen/include/xen/tmem_xen.h
--- a/xen/include/xen/tmem_xen.h Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/include/xen/tmem_xen.h Tue Mar 09 14:58:37 2010 -0700
@@ -52,6 +52,12 @@ static inline int tmh_compression_enable
static inline int tmh_compression_enabled(void)
{
return opt_tmem_compress;
+}
+
+extern int opt_tmem_dedup;
+static inline int tmh_dedup_enabled(void)
+{
+ return opt_tmem_dedup;
}
extern int opt_tmem_shared_auth;
@@ -326,6 +332,28 @@ static inline bool_t tmh_current_is_priv
return IS_PRIV(current->domain);
}
+static inline uint8_t tmh_get_first_byte(pfp_t *pfp)
+{
+ void *p = __map_domain_page(pfp);
+
+ return (uint8_t)(*(char *)p);
+}
+
+static inline int tmh_page_cmp(pfp_t *pfp1, pfp_t *pfp2)
+{
+ const uint64_t *p1 = (uint64_t *)__map_domain_page(pfp1);
+ const uint64_t *p2 = (uint64_t *)__map_domain_page(pfp2);
+ int i;
+
+ /* FIXME: code in assembly? */
+ for ( i = PAGE_SIZE/sizeof(uint64_t); i && *p1 == *p2; i--, p1++, p2++ );
+ if ( !i )
+ return 0;
+ if ( *p1 < *p2 )
+ return -1;
+ return 1;
+}
+
/* these typedefs are in the public/tmem.h interface
typedef XEN_GUEST_HANDLE(void) cli_mfn_t;
typedef XEN_GUEST_HANDLE(char) cli_va_t;