[Xen-devel] [PATCH 11 of 11] blktap2: add remus driver

# HG changeset patch
# User Brendan Cully <brendan@xxxxxxxxx>
# Date 1252530408 25200
# Node ID aaf56934865afa3f5611cd891347953cdd1e5729
# Parent  5e0779189ef3b9382675b2e574b75936a6e9fb15
blktap2: add remus driver

Blktap2 port of remus disk driver. Backwards compatible with blktap1
implementation.

Signed-off-by: Ryan O'Connor <rjo@xxxxxxxxx>
Signed-off-by: Brendan Cully <brendan@xxxxxxxxx>

diff --git a/tools/blktap2/drivers/Makefile b/tools/blktap2/drivers/Makefile
--- a/tools/blktap2/drivers/Makefile
+++ b/tools/blktap2/drivers/Makefile
@@ -36,7 +36,7 @@
 CRYPT_LIB += -lcrypto
 endif
 
-LDFLAGS_img := $(CRYPT_LIB) -lpthread -lz
+LDFLAGS_img := $(CRYPT_LIB) -lpthread -lz -lm
 
 LIBS += -L$(LIBVHDDIR) -lvhd
 
@@ -44,6 +44,14 @@
 LIBS += -luuid
 endif
 
+REMUS-OBJS  := block-remus.o
+REMUS-OBJS  += hashtable.o
+REMUS-OBJS  += hashtable_itr.o
+REMUS-OBJS  += hashtable_utility.o
+
+$(REMUS-OBJS): CFLAGS += -fgnu89-inline -I$(XEN_XENSTORE)
+
+
 LIBAIO_DIR = $(XEN_ROOT)/tools/libaio/src
 tapdisk2 tapdisk-stream tapdisk-diff $(QCOW_UTIL): AIOLIBS := $(LIBAIO_DIR)/libaio.a
 tapdisk-client tapdisk-stream tapdisk-diff $(QCOW_UTIL): CFLAGS  += -I$(LIBAIO_DIR) -I$(XEN_LIBXC)
@@ -81,6 +89,7 @@
 BLK-OBJS-y  += block-qcow.o
 BLK-OBJS-y  += aes.o
 BLK-OBJS-y  += $(PORTABLE-OBJS-y)
+BLK-OBJS-y  += $(REMUS-OBJS)
 
 all: $(IBIN) lock-util qcow-util
 
diff --git a/tools/blktap2/drivers/block-remus.c b/tools/blktap2/drivers/block-remus.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap2/drivers/block-remus.c
@@ -0,0 +1,1670 @@
+/* block-remus.c
+ *
+ * This disk sends all writes to a backup via a network interface before
+ * passing them to an underlying device.
+ * The backup is a bit more complicated:
+ *  1. It applies all incoming writes to a ramdisk.
+ *  2. When a checkpoint request arrives, it moves the ramdisk to
+ *     a committing state and uses a new ramdisk for subsequent writes.
+ *     It also acknowledges the request, to let the sender know it can
+ *     release output.
+ *  3. The ramdisk flushes its contents to the underlying driver.
+ *  4. At failover, the backup waits for the in-flight ramdisk (if any) to
+ *     drain before letting the domain be activated.
+ *
+ * The driver determines whether it is the client or server by attempting
+ * to bind to the replication address. If the address is not local,
+ * the driver acts as client.
+ *
+ * The following messages are defined for the replication stream:
+ * 1. write request
+ *    "wreq"      4
+ *    num_sectors 4
+ *    sector      8
+ *    buffer      (num_sectors * sector_size)
+ * 2. submit request (may be used as a barrier)
+ *    "sreq"      4
+ * 3. commit request
+ *    "creq"      4
+ * After a commit request, the client must wait for a completion message:
+ * 4. completion
+ *    "done"      4
+ */
+
+/* due to architectural choices in tapdisk, block-buffer is forced to
+ * reimplement some code which is meant to be private */
+#define TAPDISK
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "hashtable.h"
+#include "hashtable_itr.h"
+#include "hashtable_utility.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+
+/* timeout for reads and writes in ms */
+#define NET_TIMEOUT 500
+#define RAMDISK_HASHSIZE 128
+
+/* connect retry timeout (seconds) */
+#define REMUS_CONNRETRY_TIMEOUT 10
+
+#define RPRINTF(_f, _a...) syslog (LOG_DEBUG, "remus: " _f, ## _a)
+
+/* Operating mode of a remus disk; this is the value handed to switch_mode(). */
+enum tdremus_mode {
+  mode_invalid = 0,
+  /* no replication: entered when replicating a write fails on the primary,
+   * or when the backup itself receives a write */
+  mode_unprotected,
+  /* replicating client; entered once the connection to the backup is up */
+  mode_primary,
+  /* presumably the replication server side -- confirm against switch_mode() */
+  mode_backup
+};
+
+struct tdremus_req {
+  uint64_t sector;
+  int nb_sectors;
+  char buf[4096];
+};
+
+/* Fixed-size circular buffer of tdremus_req entries, indexed by head and tail.
+ * ring_isempty() treats head == tail as empty and ring_isfull() treats
+ * "tail + 1 wraps onto head" as full, which is why one slot is never used. */
+struct req_ring {
+  /* waste one slot to distinguish between empty and full */
+  struct tdremus_req requests[MAX_REQUESTS * 2 + 1];
+  unsigned int head;
+  unsigned int tail;
+};
+
+/* TODO: This isn't very pretty, but to properly generate our own treqs (needed
+ * by the backup) we need to know our td_vbd_t and td_image_t (blktap2
+ * internals). As a proper fix, we should consider extending the tapdisk
+ * interface with a td_create_request() function, or something similar.
+ *
+ * For now, we just grab the vbd in the td_open() command, and the td_image_t
+ * from the first read request.
+ */
+td_vbd_t *device_vbd = NULL;
+td_image_t *remus_image = NULL;
+
+struct ramdisk {
+  size_t sector_size;
+  struct hashtable* h;
+  /* when a ramdisk is flushed, h is given a new empty hash for writes
+   * while the old ramdisk (prev) is drained asynchronously. To avoid
+   * a race where a read request points to a sector in prev which has
+   * not yet been flushed, check prev on a miss in h */
+  struct hashtable* prev;
+  /* count of outstanding requests to the base driver */
+  size_t inflight;
+};
+
+/* the ramdisk intercepts the original callback for reads and writes.
+ * This holds the original data. */
+/* Might be worth making this a static array in struct ramdisk to avoid
+ * a malloc per request */
+
+struct tdremus_state;
+
+struct ramdisk_cbdata {
+  td_callback_t cb;
+  void* private;
+  char* buf;
+  struct tdremus_state* state;
+};
+
+struct ramdisk_write_cbdata {
+  struct tdremus_state* state;
+  char* buf;
+};
+
+typedef void (*queue_rw_t) (td_driver_t *driver, td_request_t treq);
+
+/* poll_fd type for blktap2 fd system. taken from block_log.c */
+typedef struct poll_fd {
+  int        fd;
+  event_id_t id;
+} poll_fd_t;
+
+struct tdremus_state {
+//  struct tap_disk* driver;
+  void* driver_data;
+
+  /* XXX: this is needed so that the server can perform operations on
+   * the driver from the stream_fd event handler. fix this. */
+  td_driver_t *tdremus_driver;
+
+  /* TODO: we may wish to replace these two FIFOs with a unix socket */
+  char*     ctl_path; /* receive flush instruction here */
+  poll_fd_t ctl_fd;     /* io_fd slot for control FIFO */
+  char*     msg_path; /* output completion message here */
+  poll_fd_t msg_fd;
+
+  /* replication host */
+  struct sockaddr_in sa;
+  poll_fd_t server_fd;    /* server listen port */
+  poll_fd_t stream_fd;     /* replication channel */
+
+  /* queue write requests, batch-replicate at submit */
+  struct req_ring write_ring;
+
+  /* ramdisk data*/
+  struct ramdisk ramdisk;
+
+  /* mode methods */
+  enum tdremus_mode mode;
+  int (*queue_flush)(td_driver_t *driver);
+};
+
+typedef struct tdremus_wire {
+  uint32_t op;
+  uint64_t id;
+  uint64_t sec;
+  uint32_t secs;
+} tdremus_wire_t;
+
+#define TDREMUS_READ "rreq"
+#define TDREMUS_WRITE "wreq"
+#define TDREMUS_SUBMIT "sreq"
+#define TDREMUS_COMMIT "creq"
+#define TDREMUS_DONE "done"
+#define TDREMUS_FAIL "fail"
+
+/* primary read/write functions */
+static void primary_queue_read(td_driver_t *driver, td_request_t treq);
+static void primary_queue_write(td_driver_t *driver, td_request_t treq);
+
+/* backup read/write functions */
+static void backup_queue_read(td_driver_t *driver, td_request_t treq);
+static void backup_queue_write(td_driver_t *driver, td_request_t treq);
+
+/* unprotected read/write functions */
+static void unprotected_queue_read(td_driver_t *driver, td_request_t treq);
+static void unprotected_queue_write(td_driver_t *driver, td_request_t treq);
+
+static int tdremus_close(td_driver_t *driver);
+
+static int switch_mode(td_driver_t *driver, enum tdremus_mode mode);
+static int ctl_respond(struct tdremus_state *s, const char *response);
+
+/* ring functions */
+/* Return the slot index that follows pos, wrapping back to slot 0 after the
+ * last of the MAX_REQUESTS * 2 + 1 slots. The ring argument is not consulted. */
+static inline unsigned int ring_next(struct req_ring* ring, unsigned int pos)
+{
+  const unsigned int next = pos + 1;
+
+  return (next >= MAX_REQUESTS * 2 + 1) ? 0 : next;
+}
+
+/* A ring whose head and tail indices coincide holds no requests. */
+static inline int ring_isempty(struct req_ring* ring)
+{
+  const unsigned int head = ring->head;
+  const unsigned int tail = ring->tail;
+
+  return head == tail;
+}
+
+/* The ring is full when advancing the tail by one slot would land on the
+ * head. */
+static inline int ring_isfull(struct req_ring* ring)
+{
+  const unsigned int after_tail = ring_next(ring, ring->tail);
+
+  return after_tail == ring->head;
+}
+
+/* functions to create and submit treq's */
+
+/*
+ * Completion callback for the write requests built by create_write_request().
+ *
+ * treq.cb_data carries the tdremus_state and treq.private the td_vbd_request_t
+ * that was allocated for the request. A failed write is fatal: the process
+ * exits because the disk image can no longer be trusted. On success the vreq
+ * is unlinked and released, the data buffer is released, and the count of
+ * in-flight ramdisk writes is decremented.
+ */
+static void
+replicated_write_callback(td_request_t treq, int err)
+{
+  struct tdremus_state *s = (struct tdremus_state *) treq.cb_data;
+  td_vbd_request_t *vreq = (td_vbd_request_t *) treq.private;
+
+  /* the write failed for now, lets panic. this is very bad */
+  if (err) {
+    RPRINTF("ramdisk write failed, disk image is not consistent\n");
+    exit(-1);
+  }
+
+  /* The write succeeded. let's pull the vreq off whatever request list
+   * it is on and free() it */
+  list_del(&vreq->next);
+  free(vreq);
+
+  /* The data buffer was allocated by merge_requests() for this request only
+   * and nothing else holds a reference to it, so it must be released here;
+   * otherwise every flushed batch leaks its buffer. */
+  free(treq.buf);
+
+  s->ramdisk.inflight--;
+  if (!s->ramdisk.inflight && !s->ramdisk.prev) {
+    /* TODO: the ramdisk has been flushed */
+  }
+}
+
+/*
+ * Build a write request for secs sectors starting at sec, taking the data
+ * from buf, and forward it down the driver chain. Completion is reported
+ * through replicated_write_callback() with state as callback data.
+ *
+ * Returns 0 once the request has been forwarded, or -1 if the bookkeeping
+ * td_vbd_request_t could not be allocated; in that case nothing was forwarded
+ * and buf still belongs to the caller.
+ */
+static inline int
+create_write_request(struct tdremus_state *state, td_sector_t sec, int secs, char *buf)
+{
+  td_request_t treq;
+  td_vbd_request_t *vreq;
+
+  /* allocate first so that a failure leaves nothing half-initialised */
+  vreq = calloc(1, sizeof(*vreq));
+  if (!vreq)
+    return -1;
+
+  /* treq is passed by value; clear it so that fields which are not set
+   * explicitly below do not carry indeterminate stack contents */
+  memset(&treq, 0, sizeof(treq));
+
+  treq.op      = TD_OP_WRITE;
+  treq.buf     = buf;
+  treq.sec     = sec;
+  treq.secs    = secs;
+  treq.image   = remus_image;
+  treq.cb      = replicated_write_callback;
+  treq.cb_data = state;
+  treq.id      = 0;
+  treq.sidx    = 0;
+  treq.private = vreq;
+
+  vreq->submitting = 1;
+  INIT_LIST_HEAD(&vreq->next);
+  tapdisk_vbd_move_request(treq.private, &device_vbd->pending_requests);
+
+  /* TODO:
+   * we should probably leave it up to the caller to forward the request */
+  td_forward_request(treq);
+
+  /* NOTE(review): replicated_write_callback() frees vreq; if the request can
+   * ever complete inside td_forward_request() this touches freed memory --
+   * confirm that completion is always deferred. */
+  vreq->submitting--;
+
+  return 0;
+}
+
+
+/* ramdisk methods */
+static int ramdisk_flush(td_driver_t *driver, struct tdremus_state *s);
+
+/* Integer mixing function used as the ramdisk hashtable's hash callback.
+ * k points at a uint64_t key; the mixed value is truncated to unsigned int.
+ * See http://www.concentric.net/~Ttwang/tech/inthash.htm */
+static unsigned int uint64_hash(void* k)
+{
+  uint64_t h = *(uint64_t*)k;
+
+  h = ~h + (h << 18);
+  h ^= h >> 31;
+  h *= 21;
+  h ^= h >> 11;
+  h += h << 6;
+  h ^= h >> 22;
+
+  return (unsigned int)h;
+}
+
+/* Key-equality callback for the ramdisk hashtable: both arguments point at
+ * uint64_t keys; returns 1 when they hold the same value, else 0. */
+static int rd_hash_equal(void* k1, void* k2)
+{
+  const uint64_t *lhs = k1;
+  const uint64_t *rhs = k2;
+
+  return *lhs == *rhs;
+}
+
+/* Copy nb_sectors sectors starting at sector out of the ramdisk into buf.
+ * Each sector is looked up in the active table first and then in the table
+ * that is being drained, if there is one. Returns 0 when every sector was
+ * found, -1 as soon as one is missing (buf may then be partly filled). */
+static int ramdisk_read(struct ramdisk* ramdisk, uint64_t sector,
+                       int nb_sectors, char* buf)
+{
+  int idx;
+
+  for (idx = 0; idx < nb_sectors; idx++) {
+    uint64_t key = sector + idx;
+    char* data = hashtable_search(ramdisk->h, &key);
+
+    /* on a miss, the sector may still be queued in a previous flush request */
+    if (!data && ramdisk->prev)
+      data = hashtable_search(ramdisk->prev, &key);
+    if (!data)
+      return -1;
+
+    memcpy(buf + idx * ramdisk->sector_size, data, ramdisk->sector_size);
+  }
+
+  return 0;
+}
+
+/* Store a copy of the len bytes at buf under key sector in table h.
+ * An existing entry is overwritten in place; otherwise a new key and value
+ * are allocated and inserted. Returns 0 on success, -1 if an allocation or
+ * the insertion fails (nothing is left allocated in that case). */
+static int ramdisk_write_hash(struct hashtable* h, uint64_t sector, char* buf,
+                             size_t len)
+{
+  uint64_t* key;
+  char* data = hashtable_search(h, &sector);
+
+  /* sector already present: just refresh its contents */
+  if (data) {
+    memcpy(data, buf, len);
+    return 0;
+  }
+
+  data = malloc(len);
+  if (!data) {
+    DPRINTF("ramdisk_write_hash: malloc failed\n");
+    return -1;
+  }
+  memcpy(data, buf, len);
+
+  key = malloc(sizeof(*key));
+  if (!key) {
+    DPRINTF("ramdisk_write_hash: error allocating key\n");
+    free(data);
+    return -1;
+  }
+  *key = sector;
+
+  if (hashtable_insert(h, key, data))
+    return 0;
+
+  DPRINTF("ramdisk_write_hash failed on sector %" PRIu64 "\n", sector);
+  free(key);
+  free(data);
+  return -1;
+}
+
+/* Write nb_sectors consecutive sectors from buf into the active ramdisk
+ * table, one sector at a time. Stops at, and returns, the first non-zero
+ * result of ramdisk_write_hash(); returns 0 when all sectors were stored. */
+static inline int ramdisk_write(struct ramdisk* ramdisk, uint64_t sector,
+                               int nb_sectors, char* buf)
+{
+  char* src = buf;
+  int done;
+
+  for (done = 0; done < nb_sectors; done++, src += ramdisk->sector_size) {
+    int rc = ramdisk_write_hash(ramdisk->h, sector + done, src,
+                                ramdisk->sector_size);
+    if (rc)
+      return rc;
+  }
+
+  return 0;
+}
+
+/* Completion handler for a ramdisk write: releases the callback data and its
+ * buffer, decrements the in-flight count and, while a draining table still
+ * exists, resubmits as much of it as possible via ramdisk_flush().
+ * res, sector, nb_sectors and id are not used. Returns 0, or the result of
+ * ramdisk_flush() when it is called. */
+static int ramdisk_write_cb(td_driver_t *driver, int res, uint64_t sector,
+                           int nb_sectors, int id, void* private)
+{
+  struct ramdisk_write_cbdata *data = private;
+  struct tdremus_state *state = data->state;
+
+  free(data->buf);
+  free(data);
+
+  /* when this reaches 0 and prev is empty, the disk is flushed. */
+  state->ramdisk.inflight--;
+
+  if (!state->ramdisk.prev)
+    return 0;
+
+  /* resubmit as much as possible in the remaining disk */
+  return ramdisk_flush(driver, state);
+}
+
+/* qsort() comparator ordering uint64_t values ascending. The values are
+ * compared rather than subtracted because the difference is unsigned. */
+static int uint64_compare(const void* k1, const void* k2)
+{
+  const uint64_t lhs = *(uint64_t*)k1;
+  const uint64_t rhs = *(uint64_t*)k2;
+
+  if (lhs < rhs)
+    return -1;
+
+  return lhs > rhs;
+}
+
+/* Set *psectors to a newly allocated array holding every sector number that
+ * is a key of h, and return the number of entries. The caller frees the
+ * array. When the table is empty the function returns 0, and on allocation
+ * failure it returns -1; in both cases *psectors is set to NULL so that the
+ * caller may free() it unconditionally. */
+static int ramdisk_get_sectors(struct hashtable* h, uint64_t** psectors)
+{
+  struct hashtable_itr* itr;
+  uint64_t* sectors;
+  int count;
+
+  /* never leave the out-parameter indeterminate */
+  *psectors = NULL;
+
+  if (!(count = hashtable_count(h)))
+    return 0;
+
+  if (!(sectors = malloc(count * sizeof(uint64_t)))) {
+    DPRINTF("ramdisk_get_sectors: error allocating sector map\n");
+    return -1;
+  }
+
+  if (!(itr = hashtable_iterator(h))) {
+    DPRINTF("ramdisk_get_sectors: error allocating iterator\n");
+    free(sectors);
+    return -1;
+  }
+
+  /* the table is known to be non-empty here, so the iterator starts on a
+   * valid entry */
+  count = 0;
+  do {
+    sectors[count++] = *(uint64_t*)hashtable_iterator_key(itr);
+  } while (hashtable_iterator_advance(itr));
+  free(itr);
+
+  *psectors = sectors;
+  return count;
+}
+
+/* Build one contiguous, valloc()-allocated buffer from count consecutive
+ * sectors of the draining table (ramdisk->prev), beginning at sector start.
+ * Each sector's stored value is freed once it has been copied; the table
+ * entries themselves are left for the caller to remove.
+ *
+ * Returns the buffer, which the caller owns, or NULL if the allocation or a
+ * sector lookup fails. */
+static char* merge_requests(struct ramdisk* ramdisk, uint64_t start,
+                           size_t count)
+{
+  char* buf;
+  char* sector;
+  size_t i;
+
+  if (!(buf = valloc(count * ramdisk->sector_size))) {
+    DPRINTF("merge_request: allocation failed\n");
+    return NULL;
+  }
+
+  for (i = 0; i < count; i++) {
+    if (!(sector = hashtable_search(ramdisk->prev, &start))) {
+      DPRINTF("merge_request: lookup failed on %"PRIu64"\n", start);
+      /* do not leak the partially filled buffer */
+      free(buf);
+      return NULL;
+    }
+
+    memcpy(buf + i * ramdisk->sector_size, sector, ramdisk->sector_size);
+    free(sector);
+
+    start++;
+  }
+
+  return buf;
+}
+
+/* The underlying driver may not handle having the whole ramdisk queued at
+ * once. We queue what we can and let the callbacks attempt to queue more. */
+/* NOTE: may be called from callback, while dd->private still belongs to
+ * the underlying driver */
+/* Drains s->ramdisk.prev: its sector numbers are sorted, runs of consecutive
+ * sectors are merged into a single buffer, and one write request per run is
+ * forwarded. Returns 0 on success or when there is nothing to do, -1 on
+ * failure. The driver argument is not used. */
+static int ramdisk_flush(td_driver_t *driver, struct tdremus_state* s)
+{
+  uint64_t* sectors;
+  char* buf;
+  uint64_t base, batchlen, j;
+  int i, count, rc = 0;
+
+  if ((count = ramdisk_get_sectors(s->ramdisk.prev, &sectors)) <= 0)
+    return count;
+
+  /* sort and merge sectors to improve disk performance */
+  qsort(sectors, count, sizeof(*sectors), uint64_compare);
+
+  for (i = 0; i < count;) {
+    base = sectors[i++];
+    while (i < count && sectors[i] == sectors[i-1] + 1)
+      i++;
+    batchlen = sectors[i-1] - base + 1;
+
+    if (!(buf = merge_requests(&s->ramdisk, base, batchlen))) {
+      RPRINTF("ramdisk_flush: merge_requests failed\n");
+      free(sectors);
+      return -1;
+    }
+
+    /* Account for the request before it is forwarded, so the completion
+     * callback never decrements a count that has not been raised yet. */
+    s->ramdisk.inflight++;
+
+    /* NOTE: create_write_request() creates a treq AND forwards it down
+     * the driver chain */
+    if (create_write_request(s, base, batchlen, buf) < 0) {
+      /* nothing was forwarded: undo the accounting and release the buffer,
+       * which still belongs to us */
+      RPRINTF("ramdisk_flush: could not create write request\n");
+      s->ramdisk.inflight--;
+      free(buf);
+      rc = -1;
+    }
+
+    /* merge_requests() has already freed the stored values of this run, so
+     * the entries have to go whether or not the request was created */
+    for (j = 0; j < batchlen; j++) {
+      hashtable_remove(s->ramdisk.prev, &base);
+      base++;
+    }
+
+    if (rc)
+      break;
+  }
+
+  if (!hashtable_count(s->ramdisk.prev)) {
+    /* everything is in flight */
+    hashtable_destroy(s->ramdisk.prev, 0);
+    s->ramdisk.prev = NULL;
+  }
+
+  free(sectors);
+
+  return rc;
+}
+
+/* flush ramdisk contents to disk */
+/* The active table becomes the draining table (prev) and a fresh table is
+ * installed for new writes. If a previous flush is still draining, the active
+ * table's sectors are copied into prev instead and the active table is
+ * destroyed. Returns 0 when there is nothing to flush or the flush was
+ * started, a negative value on failure. */
+static int ramdisk_start_flush(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+  char* buf;
+  int rc = 0;
+  int i, count;
+  uint64_t* sectors;
+
+  if (!hashtable_count(s->ramdisk.h))
+    return 0;
+
+  if (s->ramdisk.prev) {
+    /* a flush request issued while a previous flush is still in progress
+     * will merge with the previous request. If you want the previous
+     * request to be consistent, wait for it to complete. */
+    if ((count = ramdisk_get_sectors(s->ramdisk.h, &sectors)) < 0)
+      return count;
+
+    for (i = 0; i < count; i++) {
+      buf = hashtable_search(s->ramdisk.h, sectors + i);
+      if (ramdisk_write_hash(s->ramdisk.prev, sectors[i], buf,
+                             s->ramdisk.sector_size) < 0) {
+        RPRINTF("ramdisk_start_flush: could not merge sector %" PRIu64 "\n",
+                sectors[i]);
+        rc = -1;
+      }
+    }
+    free(sectors);
+
+    /* ramdisk_write_hash() stored its own copy of every sector, so the
+     * values still held by the old table must be freed along with it */
+    hashtable_destroy (s->ramdisk.h, 1);
+  } else
+    s->ramdisk.prev = s->ramdisk.h;
+
+  /* We create a new hashtable so that new writes can be performed before
+   * the old hashtable is completely drained. */
+  s->ramdisk.h = create_hashtable(RAMDISK_HASHSIZE, uint64_hash,
+                                 rd_hash_equal);
+  if (!s->ramdisk.h) {
+    RPRINTF("ramdisk_start_flush: could not allocate new ramdisk\n");
+    rc = -1;
+  }
+
+  /* still drain what has been queued, even if something above failed */
+  if (ramdisk_flush(driver, s) < 0)
+    rc = -1;
+
+  return rc;
+}
+
+
+/* Allocate the ramdisk's hashtable and record the driver's sector size.
+ * Does nothing if the table already exists. Returns 0 on success and -1 if
+ * the hashtable cannot be allocated. */
+static int ramdisk_start(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+  if (s->ramdisk.h) {
+    RPRINTF("ramdisk already allocated\n");
+    return 0;
+  }
+
+  s->ramdisk.sector_size = driver->info.sector_size;
+  s->ramdisk.h = create_hashtable(RAMDISK_HASHSIZE, uint64_hash,
+                                 rd_hash_equal);
+  if (!s->ramdisk.h) {
+    /* every later ramdisk operation dereferences the table */
+    RPRINTF("could not allocate ramdisk\n");
+    return -1;
+  }
+
+  DPRINTF("Ramdisk started, %zu bytes/sector\n", s->ramdisk.sector_size);
+
+  return 0;
+}
+
+/* common client/server functions */
+/* mayberead: read exactly len bytes from fd into buf, giving up after a
+ * timeout. A read is attempted first; select() is only used when the read
+ * came up short or returned EAGAIN. Returns 0 once len bytes have been read,
+ * -1 on end-of-file, on a read or select error, or on timeout. */
+static int mread(int fd, void* buf, size_t len)
+{
+  struct timeval tv = {
+    .tv_sec = 0,
+    .tv_usec = NET_TIMEOUT * 1000
+  };
+  size_t done = 0;
+  fd_set rfds;
+  int got;
+
+  if (!len)
+    return 0;
+
+  for (;;) {
+    got = read(fd, (char *)buf + done, len - done);
+    if (got >= 0 && done + got >= len)
+      return 0;
+
+    if (got == 0) {
+      RPRINTF("end-of-file");
+      return -1;
+    }
+    if (got < 0 && errno != EAGAIN) {
+      RPRINTF("error during read: %s\n", strerror(errno));
+      return -1;
+    }
+    if (got > 0)
+      done += got;
+
+    /* wait until more data can be read, or the timeout runs out */
+    FD_ZERO(&rfds);
+    FD_SET(fd, &rfds);
+    got = select(fd + 1, &rfds, NULL, NULL, &tv);
+    if (!got) {
+      RPRINTF("time out during read\n");
+      return -1;
+    }
+    if (got < 0) {
+      RPRINTF("error during select: %d\n", errno);
+      return -1;
+    }
+  }
+}
+
+/* maybewrite: write exactly len bytes from buf to fd, giving up after a
+ * timeout. A write is attempted first; select() is only used when the write
+ * came up short or returned EAGAIN. Returns 0 once len bytes have been
+ * written, -1 if write() returns 0, on a write or select error, or on
+ * timeout.
+ *
+ * tv is not reset between select() calls; on systems where select() updates
+ * the timeout in place (Linux does), NET_TIMEOUT bounds the whole transfer
+ * rather than each individual wait. */
+static int mwrite(int fd, void* buf, size_t len)
+{
+  fd_set wfds;
+  size_t cur = 0;
+  int rc;
+  struct timeval tv = {
+    .tv_sec = 0,
+    .tv_usec = NET_TIMEOUT * 1000
+  };
+
+  if (!len)
+    return 0;
+
+  /* write first. Only select if write is incomplete. */
+  rc = write(fd, buf, len);
+  while (rc < 0 || cur + rc < len) {
+    if (!rc) {
+      RPRINTF("end-of-file");
+      return -1;
+    }
+    if (rc < 0 && errno != EAGAIN) {
+       RPRINTF("error during write: %s\n", strerror(errno));
+       return -1;
+    }
+    if (rc > 0)
+      cur += rc;
+
+    FD_ZERO(&wfds);
+    FD_SET(fd, &wfds);
+    if (!(rc = select(fd + 1, NULL, &wfds, NULL, &tv))) {
+      RPRINTF("time out during write\n");
+      return -1;
+    } else if (rc < 0) {
+      RPRINTF("error during select: %d\n", errno);
+      return -1;
+    }
+    rc = write(fd, (char *)buf + cur, len - cur);
+  }
+
+  return 0;
+}
+
+
+/* Tear down the replication channel: unregister its tapdisk event, close the
+ * socket and mark the descriptor as lost. The value -2 is what
+ * primary_queue_write() documents as "the connection was lost", as opposed to
+ * -1 for "not connected yet". */
+static void inline close_stream_fd(struct tdremus_state *s)
+{
+    /* XXX: -2 is magic. replace with macro perhaps? */
+    tapdisk_server_unregister_event(s->stream_fd.id);
+    close(s->stream_fd.fd);
+    s->stream_fd.fd = -2;
+}
+
+/* primary functions */
+static void remus_client_event(event_id_t, char mode, void *private);
+static void remus_connect_event(event_id_t id, char mode, void *private);
+static void remus_retry_connect_event(event_id_t id, char mode, void *private);
+
+/* Start a non-blocking connection to the backup. A nonblocking socket is
+ * created and a zero-length timeout event is registered on it, so that
+ * remus_retry_connect_event() issues the actual connect(). On success the
+ * socket and event id are stored in state->stream_fd and 0 is returned; on
+ * failure the socket is closed again and -1 is returned. */
+static int primary_do_connect(struct tdremus_state *state)
+{
+  event_id_t id;
+  int fd;
+  int flags;
+
+  RPRINTF("client connecting to %s:%d...\n", inet_ntoa(state->sa.sin_addr), ntohs(state->sa.sin_port));
+
+  if ((fd = socket(PF_INET, SOCK_STREAM, 0)) < 0) {
+    RPRINTF("could not create client socket: %d\n", errno);
+    return -1;
+  }
+
+  /* make socket nonblocking */
+  if ((flags = fcntl(fd, F_GETFL, 0)) == -1)
+    flags = 0;
+  if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) {
+    RPRINTF("error making socket nonblocking\n");
+    close(fd);
+    return -1;
+  }
+
+  /* once we have created the socket and populated the address, we can now
+   * start our non-blocking connect. rather than duplicating code we trigger
+   * a timeout on the socket fd, which calls our nonblocking connect code
+   */
+  /* NOTE(review): strerror() is handed the raw (negative) return value here,
+   * as elsewhere in this file -- confirm whether it should be negated. */
+  if((id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, fd, 0, remus_retry_connect_event, state)) < 0) {
+    RPRINTF("error registering timeout client connection event handler: %s\n", strerror(id));
+    close(fd);
+    return -1;
+  }
+  state->stream_fd.fd = fd;
+  state->stream_fd.id = id;
+  return 0;
+}
+
+/* Connect to the backup synchronously. A refused connection is retried once
+ * per second without limit; any other connect error is fatal. Once connected
+ * the socket is made nonblocking and remus_client_event() is registered to
+ * read the backup's replies. Returns 0 with state->stream_fd filled in, or -1
+ * on failure (the socket is closed). */
+static int primary_blocking_connect(struct tdremus_state *state)
+{
+  int fd;
+  int id;
+  int rc;
+  int flags;
+
+  RPRINTF("client connecting to %s:%d...\n", inet_ntoa(state->sa.sin_addr), ntohs(state->sa.sin_port));
+
+  if ((fd = socket(PF_INET, SOCK_STREAM, 0)) < 0) {
+    RPRINTF("could not create client socket: %d\n", errno);
+    return -1;
+  }
+
+  do {
+    /* connect() takes a generic struct sockaddr pointer */
+    if ((rc = connect(fd, (struct sockaddr *)&state->sa, sizeof(state->sa))) < 0) {
+      if (errno == ECONNREFUSED) {
+        RPRINTF("connection refused -- retrying in 1 second\n");
+        sleep(1);
+      } else {
+        RPRINTF("connection failed: %d\n", errno);
+        close(fd);
+        return -1;
+      }
+    }
+  } while (rc < 0);
+
+  RPRINTF("client connected\n");
+
+  /* make socket nonblocking */
+  if ((flags = fcntl(fd, F_GETFL, 0)) == -1)
+    flags = 0;
+  if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1)
+  {
+    RPRINTF("error making socket nonblocking\n");
+    close(fd);
+    return -1;
+  }
+
+  if((id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, fd, 0, remus_client_event, state)) < 0) {
+    RPRINTF("error registering client event handler: %s\n", strerror(id));
+    close(fd);
+    return -1;
+  }
+
+  state->stream_fd.fd = fd;
+  state->stream_fd.id = id;
+  return 0;
+}
+
+/* Read handler in primary mode (installed by primary_start()): reads are not
+ * replicated, so the request is forwarded unchanged. The driver argument is
+ * not used. */
+static void primary_queue_read(td_driver_t *driver, td_request_t treq)
+{
+  /* just pass read through */
+  td_forward_request(treq);
+}
+
+/* TODO:
+ * The primary uses mwrite() to write the contents of a write request to the
+ * backup. This effectively blocks until all data has been copied into a system
+ * buffer or a timeout has occurred. We may wish to instead use tapdisk's
+ * nonblocking i/o interface, tapdisk_server_register_event(), to set timeouts
+ * and write data in an asynchronous fashion.
+ */
+/* Write handler in primary mode. The request is sent to the backup as
+ * "wreq", a 4-byte sector count, an 8-byte start sector (both in host byte
+ * order) and the data, and is then forwarded to the underlying driver. If
+ * replication fails the driver switches to unprotected mode and completes
+ * the request with -EBUSY so that tapdisk retries it. */
+static void primary_queue_write(td_driver_t *driver, td_request_t treq)
+{
+    struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+    char header[sizeof(uint32_t) + sizeof(uint64_t)];
+    uint32_t sectors = treq.secs;
+    uint64_t sector = treq.sec;
+
+    /* -1 means we haven't connected yet, -2 means the connection was lost */
+    if(s->stream_fd.fd == -1) {
+       RPRINTF("connecting to backup...\n");
+       if (primary_blocking_connect(s) < 0)
+           goto fail;
+    }
+
+    /* header is a plain char array: fill it with memcpy() rather than
+     * through casted pointers, which would be misaligned for the 64-bit
+     * field and would violate the aliasing rules */
+    memcpy(header, &sectors, sizeof(sectors));
+    memcpy(header + sizeof(sectors), &sector, sizeof(sector));
+
+    if (mwrite(s->stream_fd.fd, TDREMUS_WRITE, strlen(TDREMUS_WRITE)) < 0)
+       goto fail;
+    if (mwrite(s->stream_fd.fd, header, sizeof(header)) < 0)
+       goto fail;
+
+    if (mwrite(s->stream_fd.fd, treq.buf, treq.secs * driver->info.sector_size) < 0)
+       goto fail;
+
+    td_forward_request(treq);
+
+    return;
+
+  fail:
+    /* switch to unprotected mode and tell tapdisk to retry */
+    RPRINTF("write request replication failed, switching to unprotected mode");
+    switch_mode(s->tdremus_driver, mode_unprotected);
+    td_complete_request(treq, -EBUSY);
+}
+
+
+/* Flush handler in primary mode (installed as queue_flush by
+ * primary_start()): sends a "creq" commit request to the backup.
+ * Returns 0 if the request was sent or if no connection has been made yet,
+ * and -1 if the send failed, in which case the replication channel is closed.
+ * NOTE(review): a descriptor of -2 (connection lost) is not special-cased, so
+ * the write is attempted on it and close_stream_fd() runs again -- confirm
+ * this is intended. */
+static int client_flush(td_driver_t *driver)
+{
+    struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+    // RPRINTF("committing output\n");
+
+    if (s->stream_fd.fd == -1)
+       /* connection not yet established, nothing to flush */
+       return 0;
+
+    if (mwrite(s->stream_fd.fd, TDREMUS_COMMIT, strlen(TDREMUS_COMMIT)) < 0) {
+       RPRINTF("error flushing output");
+       close_stream_fd(s);
+       return -1;
+    }
+
+    return 0;
+}
+
+/* Switch the driver to primary (client) behaviour: install the primary
+ * read/write handlers and the client flush handler, and mark the replication
+ * channel as not yet connected. Always returns 0. */
+static int primary_start(td_driver_t *driver)
+{
+  struct tdremus_state *state = driver->data;
+
+  RPRINTF("activating client mode\n");
+
+  /* no connection to the backup yet */
+  state->stream_fd.id = -1;
+  state->stream_fd.fd = -1;
+
+  state->queue_flush = client_flush;
+  tapdisk_remus.td_queue_write = primary_queue_write;
+  tapdisk_remus.td_queue_read = primary_queue_read;
+
+  return 0;
+}
+
+/* timeout callback */
+/* Issues a non-blocking connect() on the replication socket. If the backup
+ * looks temporarily unreachable, the same handler is re-armed with a timeout
+ * of REMUS_CONNRETRY_TIMEOUT. If the connect is in progress (or already
+ * succeeded), remus_connect_event() is registered to run when the socket
+ * becomes writable. Any other error is logged and nothing is re-armed. */
+static void remus_retry_connect_event(event_id_t id, char mode, void *private)
+{
+  struct tdremus_state *s = (struct tdremus_state *)private;
+
+  /* do a non-blocking connect; connect() takes a generic sockaddr pointer */
+  if (connect(s->stream_fd.fd, (struct sockaddr *)&s->sa, sizeof(s->sa)) && errno != EINPROGRESS) {
+    if(errno == ECONNREFUSED || errno == ENETUNREACH || errno == EAGAIN || errno == ECONNABORTED)
+    {
+      /* try again later */
+      tapdisk_server_unregister_event(s->stream_fd.id);
+      if((id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, s->stream_fd.fd, REMUS_CONNRETRY_TIMEOUT, remus_retry_connect_event, s)) < 0) {
+        RPRINTF("error registering timeout client connection event handler: %s\n", strerror(id));
+        return;
+      }
+      s->stream_fd.id = id;
+    }
+    else
+    {
+      /* not recoverable */
+      RPRINTF("error connection to server %s\n", strerror(errno));
+      return;
+    }
+  }
+  else
+  {
+    /* the connect returned EINPROGRESS (nonblocking connect) we must wait for
+     * the fd to be writeable to determine if the connect worked */
+
+    tapdisk_server_unregister_event(s->stream_fd.id);
+    if((id = tapdisk_server_register_event(SCHEDULER_POLL_WRITE_FD, s->stream_fd.fd, 0, remus_connect_event, s)) < 0) {
+      RPRINTF("error registering client connection event handler: %s\n", strerror(id));
+      return;
+    }
+    s->stream_fd.id = id;
+  }
+}
+
+/* callback when nonblocking connect() is finished */
+/* called only by primary in unprotected state */
+/* Reads the socket's pending error with SO_ERROR. With no error, the handler
+ * is replaced by remus_client_event() and the driver switches to primary
+ * mode. When the backup merely looks unreachable, the connect is retried
+ * after REMUS_CONNRETRY_TIMEOUT. Any other error is logged and nothing
+ * further is scheduled. */
+static void remus_connect_event(event_id_t id, char mode, void *private)
+{
+  struct tdremus_state *s = (struct tdremus_state *)private;
+  socklen_t optlen = sizeof(int);
+  int so_error;
+
+  /* check to see if the connect succeeded */
+  if (getsockopt(s->stream_fd.fd, SOL_SOCKET, SO_ERROR, &so_error, &optlen)) {
+    RPRINTF("error getting socket errno\n");
+    return;
+  }
+
+  RPRINTF("socket connect returned %d\n", so_error);
+
+  if (!so_error) {
+    /* the connect succeeded: swap this handler for the reply reader */
+    tapdisk_server_unregister_event(s->stream_fd.id);
+    id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, s->stream_fd.fd, 0, remus_client_event, s);
+    if (id < 0) {
+      RPRINTF("error registering client event handler: %s\n", strerror(id));
+      return;
+    }
+    s->stream_fd.id = id;
+
+    /* switch from unprotected to protected client */
+    switch_mode(s->tdremus_driver, mode_primary);
+    return;
+  }
+
+  /* the connect did not succeed */
+  switch (so_error) {
+  case ECONNREFUSED:
+  case ENETUNREACH:
+  case ETIMEDOUT:
+  case ECONNABORTED:
+  case EAGAIN:
+    /* we can probably assume that the backup is down. just try again later */
+    tapdisk_server_unregister_event(s->stream_fd.id);
+    id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, s->stream_fd.fd, REMUS_CONNRETRY_TIMEOUT, remus_retry_connect_event, s);
+    if (id < 0) {
+      RPRINTF("error registering timeout client connection event handler: %s\n", strerror(id));
+      return;
+    }
+    s->stream_fd.id = id;
+    break;
+  default:
+    RPRINTF("socket connect returned %d, giving up\n", so_error);
+    break;
+  }
+}
+
+
+/* we install this event handler on the primary once we have connected to the
+ * backup */
+/* wait for "done" message to commit checkpoint */
+/* Reads one 4-byte message from the backup. "done" is passed on through
+ * ctl_respond(); a read failure or any other message closes the replication
+ * channel. */
+static void remus_client_event(event_id_t id, char mode, void *private)
+{
+  struct tdremus_state *s = (struct tdremus_state *)private;
+  char req[5];
+
+  if (mread(s->stream_fd.fd, req, sizeof(req) - 1) < 0) {
+    /* replication stream closed or otherwise broken (timeout, reset, &c) */
+    RPRINTF("error reading from backup\n");
+    close_stream_fd(s);
+    return;
+  }
+  req[4] = '\0';
+
+  if (strcmp(req, TDREMUS_DONE) != 0) {
+    RPRINTF("received unknown message: %s\n", req);
+    close_stream_fd(s);
+    return;
+  }
+
+  /* checkpoint committed, inform msg_fd */
+  ctl_respond(s, TDREMUS_DONE);
+}
+
+/* backup functions */
+static void remus_server_event(event_id_t id, char mode, void *private);
+
+/* Event handler for the listening socket: accepts one connection, registers
+ * remus_server_event() to read from it, and records the new socket and event
+ * id in s->stream_fd as the replication channel. On failure the error is
+ * logged and s->stream_fd is left untouched. */
+static void remus_server_accept(event_id_t id, char mode, void* private)
+{
+  struct tdremus_state* s = (struct tdremus_state *) private;
+  event_id_t event;
+  int conn;
+
+  /* XXX: add address-based black/white list */
+  conn = accept(s->server_fd.fd, NULL, NULL);
+  if (conn < 0) {
+    RPRINTF("error accepting connection: %d\n", errno);
+    return;
+  }
+
+  /* TODO: check to see if we are already replicating. if so just close the
+   * connection (or do something smarter) */
+  RPRINTF("server accepted connection\n");
+
+  /* add tapdisk event for replication stream */
+  event = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, conn, 0,
+                                        remus_server_event, s);
+  if (event < 0) {
+    RPRINTF("error registering connection event handler: %s\n", strerror(errno));
+    close(conn);
+    return;
+  }
+
+  /* store replication file descriptor */
+  s->stream_fd.id = event;
+  s->stream_fd.fd = conn;
+}
+
+/* Create the server socket, bind it to the replication address s->sa, start
+ * listening and register remus_server_accept() for incoming connections.
+ *
+ * Returns 0 on success. Returns -2 if bind() fails with any error other than
+ * EADDRINUSE (for example EADDRNOTAVAIL, when the address is not local), and
+ * -1 on every other failure. On failure the socket is closed and
+ * s->server_fd.fd is set to -1. */
+static int remus_bind(struct tdremus_state* s)
+{
+  int opt;
+  int rc = -1;
+
+  if ((s->server_fd.fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+    RPRINTF("could not create server socket: %d\n", errno);
+    return rc;
+  }
+  opt = 1;
+  if (setsockopt(s->server_fd.fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0)
+    RPRINTF("Error setting REUSEADDR on %d: %d\n", s->server_fd.fd, errno);
+
+  if (bind(s->server_fd.fd, (struct sockaddr *)&s->sa, sizeof(s->sa)) < 0) {
+    /* capture errno before logging: the calls below may overwrite it, and
+     * the return code depends on it */
+    int bind_errno = errno;
+
+    RPRINTF("could not bind server socket %d to %s:%d: %d %s\n", s->server_fd.fd,
+           inet_ntoa(s->sa.sin_addr), ntohs(s->sa.sin_port), bind_errno,
+           strerror(bind_errno));
+    if (bind_errno != EADDRINUSE)
+      rc = -2;
+    goto err_sfd;
+  }
+  if (listen(s->server_fd.fd, 10)) {
+    RPRINTF("could not listen on socket: %d\n", errno);
+    goto err_sfd;
+  }
+
+  /* The socket is now bound to the address and listening so we may now
+   * register the fd with tapdisk */
+
+  if((s->server_fd.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                                      s->server_fd.fd, 0,
+                                                      remus_server_accept, s)) < 0) {
+    RPRINTF("error registering server connection event handler: %s",
+             strerror(s->server_fd.id));
+    goto err_sfd;
+  }
+
+  return 0;
+
+  err_sfd:
+  close(s->server_fd.fd);
+  s->server_fd.fd = -1;
+
+  return rc;
+}
+
+/* Report whether the ramdisk still holds unapplied data: returns 1 while
+ * s->ramdisk.inflight or s->ramdisk.prev is set, 0 otherwise. */
+static inline int server_writes_inflight(td_driver_t *driver)
+{
+  struct tdremus_state *state = (struct tdremus_state *)driver->data;
+
+  return (state->ramdisk.inflight || state->ramdisk.prev) ? 1 : 0;
+}
+
+/* Read handler installed while in backup mode.
+ *
+ * Due to block device prefetching this may be called on the server side
+ * during normal replication. The first image seen is remembered in
+ * remus_image, then the read is forwarded.
+ *
+ * An alternative that completes the request with -EBUSY (to keep the domain
+ * from starting with stale data) is kept disabled below; it is not clear
+ * which race would require it.
+ */
+void backup_queue_read(td_driver_t *driver, td_request_t treq)
+{
+  if (!remus_image)
+    remus_image = treq.image;
+
+#if 0
+  td_complete_request(treq, -EBUSY);
+#else
+  td_forward_request(treq);
+#endif
+}
+
+/* Write handler installed while in backup mode.
+ *
+ * A write reaching the backup means the domain has failed over, so the
+ * driver is switched to unprotected mode. The request that triggered the
+ * switch is itself completed with -EBUSY.
+ * TODO: call the appropriate write function rather than return EBUSY.
+ */
+void backup_queue_write(td_driver_t *driver, td_request_t treq)
+{
+  switch_mode(driver, mode_unprotected);
+  td_complete_request(treq, -EBUSY);
+}
+
+/* Enter backup mode: start the ramdisk, then install the backup read and
+ * write handlers on tapdisk_remus. Returns 0 on success, -1 if
+ * ramdisk_start fails.
+ * TODO set flush function */
+static int backup_start(td_driver_t *driver)
+{
+  if (ramdisk_start(driver) < 0)
+    return -1;
+
+  tapdisk_remus.td_queue_write = backup_queue_write;
+  tapdisk_remus.td_queue_read = backup_queue_read;
+
+  return 0;
+}
+
+/* Handle a write request received on the replication stream.
+ *
+ * Reads a header made of a uint32_t sector count followed by a uint64_t
+ * starting sector, then the payload, and stores the payload in the ramdisk.
+ *
+ * Returns 0 on success and -1 on failure. Read and ramdisk failures also
+ * close the replication stream; an oversized request only returns -1.
+ */
+static int server_do_wreq(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+  char buf[4096];
+  char header[sizeof(uint32_t) + sizeof(uint64_t)];
+  uint32_t sectors;
+  uint64_t sector;
+  uint64_t len;
+
+  // RPRINTF("received write request\n");
+
+  if (mread(s->stream_fd.fd, header, sizeof(header)) < 0)
+    goto err;
+
+  /* copy the fields out instead of casting: header is a char array and so
+   * gives no alignment guarantee for uint32_t/uint64_t access */
+  memcpy(&sectors, header, sizeof(sectors));
+  memcpy(&sector, header + sizeof(sectors), sizeof(sector));
+
+  /* compute the payload size in 64 bits so that a huge sector count from
+   * the wire cannot wrap or truncate and slip past the buffer check */
+  len = (uint64_t)sectors * (uint64_t)driver->info.sector_size;
+
+  if (len > sizeof(buf)) {
+    /* freak out! */
+    RPRINTF("write request too large: %llu/%u\n", (unsigned long long)len,
+            (unsigned)sizeof(buf));
+    return -1;
+  }
+
+  if (mread(s->stream_fd.fd, buf, (size_t)len) < 0)
+    goto err;
+
+  if (ramdisk_write(&s->ramdisk, sector, sectors, buf) < 0)
+      goto err;
+
+  return 0;
+
+  err:
+  /* should start failover */
+  RPRINTF("backup write request error\n");
+  close_stream_fd(s);
+
+  return -1;
+}
+
+/* Handle a submit request from the replication stream. Currently a no-op
+ * that always returns 0. */
+static int server_do_sreq(td_driver_t *driver)
+{
+  /* RPRINTF("submit request received\n"); */
+  (void)driver;
+
+  return 0;
+}
+
+/* Handle a commit request: at this point the server can start applying the
+ * most recent ramdisk. Starts the ramdisk flush and acknowledges with
+ * TDREMUS_DONE on the replication stream.
+ *
+ * Returns 0 if the whole acknowledgement was written, -1 otherwise.
+ */
+static int server_do_creq(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+  const size_t done_len = strlen(TDREMUS_DONE);
+
+  // RPRINTF("committing buffer\n");
+
+  ramdisk_start_flush(driver);
+
+  /* XXX this message should not be sent until flush completes! */
+  /* compare against the length actually requested rather than a literal 4 */
+  if (write(s->stream_fd.fd, TDREMUS_DONE, done_len) != (ssize_t)done_len)
+    return -1;
+
+  return 0;
+}
+
+
+/* called when data is pending on the replication stream (s->stream_fd).
+ *
+ * Reads a 4-byte request tag and dispatches to the write, submit or commit
+ * handler. If the tag cannot be read the driver switches to unprotected
+ * mode. Handler return values are not inspected.
+ */
+static void remus_server_event(event_id_t id, char mode, void *private)
+{
+  struct tdremus_state *state = (struct tdremus_state *)private;
+  td_driver_t *drv = state->tdremus_driver;
+  char tag[5];
+
+  /* TODO: add a get_connection_by_event_id() function.
+   * for now we can assume that the fd is s->stream_fd */
+
+  if (mread(state->stream_fd.fd, tag, sizeof(tag) - 1) < 0) {
+      RPRINTF("error reading server event, activating backup\n");
+      switch_mode(drv, mode_unprotected);
+      return;
+  }
+
+  tag[sizeof(tag) - 1] = '\0';
+
+  if (strcmp(tag, TDREMUS_WRITE) == 0) {
+    server_do_wreq(drv);
+    return;
+  }
+  if (strcmp(tag, TDREMUS_SUBMIT) == 0) {
+    server_do_sreq(drv);
+    return;
+  }
+  if (strcmp(tag, TDREMUS_COMMIT) == 0) {
+    server_do_creq(drv);
+    return;
+  }
+
+  RPRINTF("unknown request received: %s\n", tag);
+}
+
+/* unprotected */
+
+/* Read handler for unprotected mode. While a previous ramdisk is still being
+ * flushed the request is completed with -EBUSY (if this becomes an issue we
+ * can do something smarter); otherwise the read is passed through. */
+void unprotected_queue_read(td_driver_t *driver, td_request_t treq)
+{
+  if (!server_writes_inflight(driver)) {
+    td_forward_request(treq);
+    return;
+  }
+
+  td_complete_request(treq, -EBUSY);
+}
+
+/* Write handler for unprotected mode. While a previous ramdisk is still
+ * being flushed the request is completed with -EBUSY; otherwise the write is
+ * passed through.
+ * For a recoverable remus solution we need to log unprotected writes here. */
+void unprotected_queue_write(td_driver_t *driver, td_request_t treq)
+{
+  if (!server_writes_inflight(driver)) {
+    // RPRINTF("servicing write request on backup\n");
+    td_forward_request(treq);
+    return;
+  }
+
+  RPRINTF("queue_write: waiting for queue to drain");
+  td_complete_request(treq, -EBUSY);
+}
+
+/* Enter unprotected (passthrough) mode: drop the replication stream, tear
+ * down the listening server socket if one is open, and install the
+ * unprotected read/write handlers. Always returns 0.
+ */
+static int unprotected_start(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+  RPRINTF("failure detected, activating passthrough\n");
+
+  /* close the replication stream */
+  close_stream_fd(s);
+
+  /* unregister and close the server socket. remus_bind leaves
+   * server_fd.fd at -1 whenever the socket was not set up, in which case
+   * there is no event to unregister and nothing to close */
+  if (s->server_fd.fd >= 0) {
+    tapdisk_server_unregister_event(s->server_fd.id);
+    close(s->server_fd.fd);
+    s->server_fd.fd = -1;
+  }
+
+  /* install the unprotected read/write handlers */
+  tapdisk_remus.td_queue_read = unprotected_queue_read;
+  tapdisk_remus.td_queue_write = unprotected_queue_write;
+
+  return 0;
+}
+
+
+/* control */
+
+/* Resolve addr with gethostbyname and store the first IPv4 address, in
+ * network byte order, in ia. Returns 0 on success, -1 if the lookup fails
+ * or yields no usable IPv4 address.
+ */
+static inline int resolve_address(const char* addr, struct in_addr* ia)
+{
+  struct hostent* he;
+
+  if (!(he = gethostbyname(addr))) {
+    RPRINTF("error resolving %s: %d\n", addr, h_errno);
+    return -1;
+  }
+
+  if (!he->h_addr_list[0]) {
+    RPRINTF("no address found for %s\n", addr);
+    return -1;
+  }
+
+  /* only an IPv4 result fits in struct in_addr */
+  if (he->h_addrtype != AF_INET || he->h_length != (int)sizeof(ia->s_addr)) {
+    RPRINTF("no address found for %s\n", addr);
+    return -1;
+  }
+
+  /* network byte order; copied byte-wise because h_addr_list entries are
+   * char pointers with no alignment guarantee */
+  memcpy(&ia->s_addr, he->h_addr_list[0], sizeof(ia->s_addr));
+
+  return 0;
+}
+
+/* Parse name as "host:port", resolve it with getaddrinfo and store the first
+ * AF_INET result in state->sa.
+ *
+ * Returns 0 on success, -ENOENT if the ':' separator is missing, the lookup
+ * fails or no IPv4 address is found, and -ENOMEM if the host string cannot
+ * be allocated.
+ */
+static int get_args(td_driver_t *driver, const char* name)
+{
+  struct tdremus_state *state = (struct tdremus_state *)driver->data;
+  char* host;
+  const char* port;
+  int gai_status;
+  int valid_addr;
+  struct addrinfo gai_hints;
+  struct addrinfo *servinfo, *servinfo_itr;
+
+  memset(&gai_hints, 0, sizeof gai_hints);
+  gai_hints.ai_family = AF_UNSPEC;
+  gai_hints.ai_socktype = SOCK_STREAM;
+
+  port = strchr(name, ':');
+  if (!port) {
+    RPRINTF("missing host in %s\n", name);
+    return -ENOENT;
+  }
+  if (!(host = strndup(name, port - name))) {
+    RPRINTF("unable to allocate host\n");
+    return -ENOMEM;
+  }
+  port++;
+
+  gai_status = getaddrinfo(host, port, &gai_hints, &servinfo);
+  /* host is only needed for the lookup; release it on every path */
+  free(host);
+  if (gai_status != 0) {
+    RPRINTF("getaddrinfo error: %s\n", gai_strerror(gai_status));
+    return -ENOENT;
+  }
+
+  /* TODO: do something smarter here */
+  valid_addr = 0;
+  for(servinfo_itr = servinfo; servinfo_itr != NULL; servinfo_itr = servinfo_itr->ai_next) {
+    if (servinfo_itr->ai_family == AF_INET) {
+      valid_addr = 1;
+      memset(&state->sa, 0, sizeof(state->sa));
+      state->sa = *(struct sockaddr_in *)servinfo_itr->ai_addr;
+      break;
+    }
+  }
+  freeaddrinfo(servinfo);
+
+  if (!valid_addr)
+    return -ENOENT;
+
+  RPRINTF("host: %s, port: %d\n", inet_ntoa(state->sa.sin_addr), ntohs(state->sa.sin_port));
+
+  return 0;
+}
+
+/* Switch the driver to the requested mode.
+ *
+ * Does nothing and returns 0 if already in that mode. Otherwise any
+ * installed queue_flush hook runs first; if it fails the target falls back
+ * to mode_unprotected. s->mode is updated only when the mode's start routine
+ * returns 0. Returns that routine's result, or -1 for an unknown mode.
+ */
+static int switch_mode(td_driver_t *driver, enum tdremus_mode mode)
+{
+  struct tdremus_state *state = (struct tdremus_state *)driver->data;
+  int rc;
+
+  if (state->mode == mode)
+    return 0;
+
+  if (state->queue_flush) {
+    rc = state->queue_flush(driver);
+    if (rc < 0) {
+      // fall back to unprotected mode on error
+      RPRINTF("switch_mode: error flushing queue (old: %d, new: %d)", state->mode, mode);
+      mode = mode_unprotected;
+    }
+  }
+
+  switch (mode) {
+  case mode_unprotected:
+    rc = unprotected_start(driver);
+    break;
+  case mode_primary:
+    rc = primary_start(driver);
+    break;
+  case mode_backup:
+    rc = backup_start(driver);
+    break;
+  default:
+    RPRINTF("unknown mode requested: %d\n", mode);
+    rc = -1;
+    break;
+  }
+
+  if (rc == 0)
+    state->mode = mode;
+
+  return rc;
+}
+
+/* Event handler for the control FIFO.
+ *
+ * Reads one message from s->ctl_fd. A 0-byte read closes and reopens the
+ * FIFO. A message starting with "flush" invokes the queue_flush hook, if
+ * set, and reports TDREMUS_FAIL through ctl_respond when the hook returns
+ * non-zero. Anything else is logged as an unknown command.
+ */
+static void ctl_request(event_id_t id, char mode, void *private)
+{
+  struct tdremus_state *state = (struct tdremus_state *)private;
+  td_driver_t *drv = state->tdremus_driver;
+  char msg[80];
+  int nread;
+
+  /* leave room for the terminating nul appended below */
+  nread = read(state->ctl_fd.fd, msg, sizeof(msg) - 1);
+
+  if (nread == 0) {
+    RPRINTF("0-byte read received, reopening FIFO\n");
+    /*TODO: we may have to unregister/re-register with tapdisk_server */
+    close(state->ctl_fd.fd);
+    RPRINTF("FIFO closed\n");
+    state->ctl_fd.fd = open(state->ctl_path, O_RDWR);
+    if (state->ctl_fd.fd < 0) {
+       RPRINTF("error reopening FIFO: %d\n", errno);
+    }
+    return;
+  }
+
+  if (nread < 0) {
+    RPRINTF("error reading from FIFO: %d\n", errno);
+    return;
+  }
+
+  /* TODO: need to get driver somehow */
+  msg[nread] = '\0';
+
+  if (strncmp(msg, "flush", 5) != 0) {
+    RPRINTF("unknown command: %s\n", msg);
+    return;
+  }
+
+  if (state->queue_flush && state->queue_flush(drv)) {
+    RPRINTF("error passing flush request to backup");
+    ctl_respond(state, TDREMUS_FAIL);
+  }
+}
+
+/* Write response to the message FIFO. Returns the result of write(); on a
+ * write error the FIFO is closed and reopened from s->msg_path. */
+static int ctl_respond(struct tdremus_state *s, const char *response)
+{
+    int written = write(s->msg_fd.fd, response, strlen(response));
+
+    if (written >= 0)
+       return written;
+
+    RPRINTF("error writing notification: %d\n", errno);
+    close(s->msg_fd.fd);
+    s->msg_fd.fd = open(s->msg_path, O_RDWR);
+    if (s->msg_fd.fd < 0)
+       RPRINTF("error reopening FIFO: %d\n", errno);
+
+    return written;
+}
+
+/* Create and open the control and message FIFOs for this device.
+ *
+ * The FIFO paths are derived from name under BLKTAP_CTRL_DIR, with ':' and
+ * '/' in the device part replaced by '_'. Must be called after the
+ * underlying driver has been initialized.
+ *
+ * Returns 0 on success, -1 on failure. On failure both path pointers are
+ * NULL and no descriptor opened here is left open.
+ */
+static int ctl_open(td_driver_t *driver, const char* name)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+  int i, l;
+
+  /* first we must ensure that BLKTAP_CTRL_DIR exists */
+  if (mkdir(BLKTAP_CTRL_DIR, 0755) && errno != EEXIST)
+  {
+    DPRINTF("error creating directory %s: %d\n", BLKTAP_CTRL_DIR, errno);
+    return -1;
+  }
+
+  /* use the device name to create the control fifo path */
+  if (asprintf(&s->ctl_path, BLKTAP_CTRL_DIR "/remus_%s", name) < 0) {
+    /* asprintf may leave the pointer indeterminate on failure */
+    s->ctl_path = NULL;
+    return -1;
+  }
+  /* scrub fifo pathname  */
+  for (i = strlen(BLKTAP_CTRL_DIR) + 1, l = strlen(s->ctl_path); i < l; i++) {
+    if (strchr(":/", s->ctl_path[i]))
+      s->ctl_path[i] = '_';
+  }
+  if (asprintf(&s->msg_path, "%s.msg", s->ctl_path) < 0) {
+    s->msg_path = NULL;
+    goto err_ctlfifo;
+  }
+
+  if (mkfifo(s->ctl_path, S_IRWXU|S_IRWXG|S_IRWXO) && errno != EEXIST) {
+    RPRINTF("error creating control FIFO %s: %d\n", s->ctl_path, errno);
+    goto err_msgfifo;
+  }
+
+  if (mkfifo(s->msg_path, S_IRWXU|S_IRWXG|S_IRWXO) && errno != EEXIST) {
+    RPRINTF("error creating message FIFO %s: %d\n", s->msg_path, errno);
+    goto err_msgfifo;
+  }
+
+  /* RDWR so that fd doesn't block select when no writer is present */
+  if ((s->ctl_fd.fd = open(s->ctl_path, O_RDWR)) < 0) {
+    RPRINTF("error opening control FIFO %s: %d\n", s->ctl_path, errno);
+    goto err_msgfifo;
+  }
+
+  if ((s->msg_fd.fd = open(s->msg_path, O_RDWR)) < 0) {
+    RPRINTF("error opening message FIFO %s: %d\n", s->msg_path, errno);
+    goto err_openctlfifo;
+  }
+
+  RPRINTF("control FIFO %s\n", s->ctl_path);
+  RPRINTF("message FIFO %s\n", s->msg_path);
+
+  return 0;
+
+  err_openctlfifo:
+  close(s->ctl_fd.fd);
+  /* do not leave a stale descriptor number behind for later cleanup */
+  s->ctl_fd.fd = -1;
+  err_msgfifo:
+  free(s->msg_path);
+  s->msg_path = NULL;
+  err_ctlfifo:
+  free(s->ctl_path);
+  s->ctl_path = NULL;
+  return -1;
+}
+
+/* Close the control and message FIFOs, remove them from the filesystem and
+ * free their path strings. Safe to call when they were never opened. */
+static void ctl_close(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+  /* TODO: close *all* connections */
+
+  /* -1 marks "not open" (see tdremus_open); 0 is a valid descriptor */
+  if (s->ctl_fd.fd >= 0) {
+    close(s->ctl_fd.fd);
+    s->ctl_fd.fd = -1;
+  }
+  if (s->msg_fd.fd >= 0) {
+    close(s->msg_fd.fd);
+    s->msg_fd.fd = -1;
+  }
+
+  if (s->ctl_path) {
+    unlink(s->ctl_path);
+    free(s->ctl_path);
+    s->ctl_path = NULL;
+  }
+  if (s->msg_path) {
+    unlink(s->msg_path);
+    free(s->msg_path);
+    s->msg_path = NULL;
+  }
+}
+
+/* Register the control FIFO with the tapdisk scheduler so that ctl_request
+ * runs when it becomes readable. Returns 0 on success, -1 if registration
+ * returns a negative id. */
+static int ctl_register(struct tdremus_state *s)
+{
+  RPRINTF("registering ctl fifo\n");
+
+  s->ctl_fd.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                               s->ctl_fd.fd, 0,
+                                               ctl_request, s);
+
+  if (s->ctl_fd.id >= 0)
+    return 0;
+
+  RPRINTF("error registering ctrl FIFO %s: %d\n", s->ctl_path, s->ctl_fd.id);
+  return -1;
+}
+
+/* interface */
+
+/* td_open callback.
+ *
+ * Resets the driver state, parses name into a socket address, sets up and
+ * registers the control FIFOs, then tries to bind the replication server
+ * socket: a successful bind selects backup mode, a -2 result from
+ * remus_bind selects primary mode.
+ *
+ * Returns 0 on success, the error from get_args/ctl_open/ctl_register if
+ * one of those fails, and -EIO if binding or the mode switch fails.
+ */
+static int tdremus_open(td_driver_t *driver, const char *name,
+                         td_flag_t flags)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+  int rc;
+
+  RPRINTF("opening %s\n", name);
+
+  /* first we need to get the underlying vbd for this driver stack. To do so we
+   * need to know the vbd's id. Fortunately, for tapdisk2 this is hard-coded as
+   * 0 (see tapdisk2.c)
+   */
+  device_vbd = tapdisk_server_get_vbd(0);
+
+  memset(s, 0, sizeof(*s));
+  s->server_fd.fd = -1;
+  s->stream_fd.fd = -1;
+  s->ctl_fd.fd = -1;
+  s->msg_fd.fd = -1;
+
+  /* TODO: this is only needed so that the server can send writes down
+   * the driver stack from the stream_fd event handler */
+  s->tdremus_driver = driver;
+
+  /* parse name to get info etc */
+  if ((rc = get_args(driver, name)))
+    return rc;
+
+  /* s was zeroed above, so there is no driver_data to free on the error
+   * paths below */
+  if ((rc = ctl_open(driver, name))) {
+    RPRINTF("error setting up control channel\n");
+    return rc;
+  }
+
+  if ((rc = ctl_register(s))) {
+    RPRINTF("error registering control channel\n");
+    /* undo ctl_open so the FIFOs and their paths are not leaked */
+    ctl_close(driver);
+    return rc;
+  }
+
+  if (!(rc = remus_bind(s)))
+      rc = switch_mode(driver, mode_backup);
+  else if (rc == -2)
+      rc = switch_mode(driver, mode_primary);
+
+  if (!rc)
+      return 0;
+
+  tdremus_close(driver);
+  return -EIO;
+}
+
+/* td_close callback: frees driver_data, closes the server socket and the
+ * replication stream if open, and tears down the control FIFOs.
+ * Always returns 0. */
+static int tdremus_close(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+  RPRINTF("closing\n");
+
+  if (s->driver_data) {
+    free(s->driver_data);
+    s->driver_data = NULL;
+  }
+  if (s->server_fd.fd >= 0) {
+    close(s->server_fd.fd);
+    s->server_fd.fd = -1;
+  }
+  if (s->stream_fd.fd >= 0)
+    close_stream_fd(s);
+
+  ctl_close(driver);
+
+  /* previously returned an uninitialized local */
+  return 0;
+}
+
+/* td_get_parent_id callback: always fails with -EINVAL, since this driver
+ * is not expected to have a parent (for now). */
+static int tdremus_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+  (void)driver;
+  (void)id;
+
+  return -EINVAL;
+}
+
+/* td_validate_parent callback: performs no checks and always returns 0. */
+static int tdremus_validate_parent(td_driver_t *driver,
+                            td_driver_t *pdriver, td_flag_t flags)
+{
+  (void)driver;
+  (void)pdriver;
+  (void)flags;
+
+  return 0;
+}
+
+/* Driver descriptor for the remus disk. The read/write handlers start out as
+ * the unprotected ones; backup_start and unprotected_start overwrite
+ * td_queue_read/td_queue_write at runtime when the mode changes. */
+struct tap_disk tapdisk_remus = {
+  .disk_type          = "tapdisk_remus",
+  .private_data_size  = sizeof(struct tdremus_state),
+  .td_open            = tdremus_open,
+  .td_queue_read      = unprotected_queue_read,
+  .td_queue_write     = unprotected_queue_write,
+  .td_close           = tdremus_close,
+  .td_get_parent_id   = tdremus_get_parent_id,
+  .td_validate_parent = tdremus_validate_parent,
+  .td_debug           = NULL,
+};
diff --git a/tools/blktap2/drivers/disktypes.h 
b/tools/blktap2/drivers/disktypes.h
--- a/tools/blktap2/drivers/disktypes.h
+++ b/tools/blktap2/drivers/disktypes.h
@@ -49,6 +49,7 @@
  extern struct tap_disk tapdisk_qcow; 
 extern struct tap_disk tapdisk_block_cache;
 extern struct tap_disk tapdisk_log;
+extern struct tap_disk tapdisk_remus;
 
 #define MAX_DISK_TYPES        20
 
@@ -61,6 +62,7 @@
 #define DISK_TYPE_QCOW        6
 #define DISK_TYPE_BLOCK_CACHE 7
 #define DISK_TYPE_LOG         9
+#define DISK_TYPE_REMUS       10
 
 /*Define Individual Disk Parameters here */
 static disk_info_t null_disk = {
@@ -167,6 +169,16 @@
 #endif
 };
 
+/* Registration entry for the remus driver, listed in dtypes[] below.
+ * NOTE(review): this is a positional initializer; the meaning of each slot
+ * depends on the disk_info_t declaration, which is not shown here - confirm
+ * the field order against it. The driver pointer is only present when
+ * TAPDISK is defined. */
+static disk_info_t remus_disk = {
+       DISK_TYPE_REMUS,
+       "remus disk replicator (remus)",
+       "remus",
+       0,
+#ifdef TAPDISK
+       &tapdisk_remus,
+#endif
+};
+
 /*Main disk info array */
 static disk_info_t *dtypes[] = {
        &aio_disk,
@@ -179,6 +191,7 @@
        &block_cache_disk,
        &null_disk,
        &log_disk,
+       &remus_disk,
 };
 
 #endif
diff --git a/tools/blktap2/drivers/hashtable.c 
b/tools/blktap2/drivers/hashtable.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap2/drivers/hashtable.c
@@ -0,0 +1,274 @@
+/* Copyright (C) 2004 Christopher Clark <firstname.lastname@xxxxxxxxxxxx> */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/*
+Credit for primes table: Aaron Krowne
+ http://br.endernet.org/~akrowne/
+ http://planetmath.org/encyclopedia/GoodHashTablePrimes.html
+*/
+/* Table sizes, in increasing order. create_hashtable picks the first entry
+ * larger than the requested minimum; hashtable_expand steps to the next. */
+static const unsigned int primes[] = {
+53, 97, 193, 389,
+769, 1543, 3079, 6151,
+12289, 24593, 49157, 98317,
+196613, 393241, 786433, 1572869,
+3145739, 6291469, 12582917, 25165843,
+50331653, 100663319, 201326611, 402653189,
+805306457, 1610612741
+};
+/* number of entries in primes[] */
+const unsigned int prime_table_length = sizeof(primes)/sizeof(primes[0]);
+/* loadlimit is ceil(table size * max_load_factor); hashtable_insert tries to
+ * expand the table once the entry count exceeds it */
+const float max_load_factor = 0.65;
+
+/*****************************************************************************/
+/* Allocate an empty hashtable whose bucket count is the first prime in
+ * primes[] greater than minsize. hashf and eqf are stored as the hash and
+ * key-equality callbacks. Returns NULL if minsize exceeds 2^30 or on
+ * allocation failure. */
+struct hashtable *
+create_hashtable(unsigned int minsize,
+                 unsigned int (*hashf) (void*),
+                 int (*eqf) (void*,void*))
+{
+    struct hashtable *ht;
+    unsigned int idx = 0;
+    unsigned int nbuckets = primes[0];
+
+    /* Check requested hashtable isn't too large */
+    if (minsize > (1u << 30))
+        return NULL;
+
+    /* Enforce size as prime */
+    while (idx < prime_table_length) {
+        if (primes[idx] > minsize) {
+            nbuckets = primes[idx];
+            break;
+        }
+        idx++;
+    }
+
+    ht = malloc(sizeof(*ht));
+    if (ht == NULL)
+        return NULL; /*oom*/
+
+    /* calloc hands back an already-zeroed bucket array */
+    ht->table = calloc(nbuckets, sizeof(struct entry *));
+    if (ht->table == NULL) {
+        free(ht);
+        return NULL; /*oom*/
+    }
+
+    ht->tablelength  = nbuckets;
+    ht->primeindex   = idx;
+    ht->entrycount   = 0;
+    ht->hashfn       = hashf;
+    ht->eqfn         = eqf;
+    ht->loadlimit    = (unsigned int) ceil(nbuckets * max_load_factor);
+    return ht;
+}
+
+/*****************************************************************************/
+/* Run the table's hash callback on k and mix the result, to protect against
+ * poor hash functions (logic taken from the java 1.4 hashtable source). */
+unsigned int
+hash(struct hashtable *h, void *k)
+{
+    unsigned int v = h->hashfn(k);
+
+    v = v + ~(v << 9);
+    v = v ^ ((v >> 14) | (v << 18)); /* >>> */
+    v = v + (v << 4);
+    v = v ^ ((v >> 10) | (v << 22)); /* >>> */
+
+    return v;
+}
+
+/*****************************************************************************/
+/* Grow the table to the next size in primes[] and redistribute all entries.
+ * Returns non-zero (-1) on success, 0 if the table is already at its
+ * maximum size or memory could not be obtained. */
+static int
+hashtable_expand(struct hashtable *h)
+{
+    /* Roughly double the size of the table to accommodate more entries */
+    struct entry **newtable;
+    struct entry *e;
+    struct entry **pE;
+    unsigned int newsize, i, index;
+    /* Check we're not hitting max capacity */
+    if (h->primeindex == (prime_table_length - 1)) return 0;
+    newsize = primes[++(h->primeindex)];
+
+    newtable = (struct entry **)malloc(sizeof(struct entry*) * newsize);
+    if (NULL != newtable)
+    {
+        memset(newtable, 0, newsize * sizeof(struct entry *));
+        /* This algorithm is not 'stable'. ie. it reverses the list
+         * when it transfers entries between the tables */
+        for (i = 0; i < h->tablelength; i++) {
+            while (NULL != (e = h->table[i])) {
+                h->table[i] = e->next;
+                index = indexFor(newsize,e->h);
+                e->next = newtable[index];
+                newtable[index] = e;
+            }
+        }
+        free(h->table);
+        h->table = newtable;
+    }
+    /* Plan B: realloc instead */
+    else
+    {
+        newtable = (struct entry **)
+                   realloc(h->table, newsize * sizeof(struct entry *));
+        if (NULL == newtable) { (h->primeindex)--; return 0; }
+        h->table = newtable;
+        /* zero only the newly added buckets: take the ADDRESS of the first
+         * new slot and size the fill in bytes, not elements */
+        memset(&newtable[h->tablelength], 0,
+               (newsize - h->tablelength) * sizeof(struct entry *));
+        for (i = 0; i < h->tablelength; i++) {
+            for (pE = &(newtable[i]), e = *pE; e != NULL; e = *pE) {
+                index = indexFor(newsize,e->h);
+                if (index == i)
+                {
+                    pE = &(e->next);
+                }
+                else
+                {
+                    *pE = e->next;
+                    e->next = newtable[index];
+                    newtable[index] = e;
+                }
+            }
+        }
+    }
+    h->tablelength = newsize;
+    h->loadlimit   = (unsigned int) ceil(newsize * max_load_factor);
+    return -1;
+}
+
+/*****************************************************************************/
+/* Return the number of entries currently stored in h. */
+unsigned int
+hashtable_count(struct hashtable *h)
+{
+    unsigned int count = h->entrycount;
+
+    return count;
+}
+
+/*****************************************************************************/
+/* Insert (k, v) at the head of its bucket chain. Returns non-zero (-1) on
+ * success, 0 if the entry could not be allocated.
+ * This method allows duplicate keys - but they shouldn't be used. */
+int
+hashtable_insert(struct hashtable *h, void *k, void *v)
+{
+    struct entry *node;
+    unsigned int bucket;
+
+    h->entrycount++;
+    if (h->entrycount > h->loadlimit)
+    {
+        /* The result is deliberately ignored: if expansion fails we still
+         * try to fit this one element into the existing table, and the next
+         * insert will attempt to expand again. */
+        hashtable_expand(h);
+    }
+
+    node = malloc(sizeof(*node));
+    if (node == NULL) /*oom*/
+    {
+        h->entrycount--;
+        return 0;
+    }
+
+    node->h = hash(h,k);
+    node->k = k;
+    node->v = v;
+
+    bucket = indexFor(h->tablelength,node->h);
+    node->next = h->table[bucket];
+    h->table[bucket] = node;
+    return -1;
+}
+
+/*****************************************************************************/
+/* Return the value stored under key k, or NULL if no entry matches. */
+void * /* returns value associated with key */
+hashtable_search(struct hashtable *h, void *k)
+{
+    unsigned int keyhash = hash(h,k);
+    unsigned int bucket = indexFor(h->tablelength,keyhash);
+    struct entry *cur;
+
+    for (cur = h->table[bucket]; cur != NULL; cur = cur->next)
+    {
+        /* Check hash value to short circuit heavier comparison */
+        if (keyhash != cur->h)
+            continue;
+        if (h->eqfn(k, cur->k))
+            return cur->v;
+    }
+    return NULL;
+}
+
+/*****************************************************************************/
+/* Unlink the first entry matching key k, release its key with freekey and
+ * free the entry itself. Returns the entry's value (ownership passes to the
+ * caller), or NULL if no entry matches. */
+void * /* returns value associated with key */
+hashtable_remove(struct hashtable *h, void *k)
+{
+    /* TODO: consider compacting the table when the load factor drops enough,
+     *       or provide a 'compact' method. */
+
+    struct entry *e;
+    struct entry **pE;
+    void *v;
+    unsigned int hashvalue, index;
+
+    hashvalue = hash(h,k);
+    /* reuse the hash computed above instead of hashing the key twice */
+    index = indexFor(h->tablelength,hashvalue);
+    pE = &(h->table[index]);
+    e = *pE;
+    while (NULL != e)
+    {
+        /* Check hash value to short circuit heavier comparison */
+        if ((hashvalue == e->h) && (h->eqfn(k, e->k)))
+        {
+            *pE = e->next;
+            h->entrycount--;
+            v = e->v;
+            freekey(e->k);
+            free(e);
+            return v;
+        }
+        pE = &(e->next);
+        e = e->next;
+    }
+    return NULL;
+}
+
+/*****************************************************************************/
+/* destroy */
+/* Free every entry and its key, then the bucket array and the table itself.
+ * Values are freed as well only when free_values is non-zero. */
+void
+hashtable_destroy(struct hashtable *h, int free_values)
+{
+    struct entry **buckets = h->table;
+    struct entry *cur, *doomed;
+    unsigned int b;
+
+    for (b = 0; b < h->tablelength; b++)
+    {
+        cur = buckets[b];
+        while (cur != NULL)
+        {
+            doomed = cur;
+            cur = cur->next;
+            freekey(doomed->k);
+            if (free_values)
+                free(doomed->v);
+            free(doomed);
+        }
+    }
+    free(h->table);
+    free(h);
+}
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ * 
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap2/drivers/hashtable_itr.c 
b/tools/blktap2/drivers/hashtable_itr.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap2/drivers/hashtable_itr.c
@@ -0,0 +1,188 @@
+/* Copyright (C) 2002, 2004 Christopher Clark  
<firstname.lastname@xxxxxxxxxxxx> */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include "hashtable_itr.h"
+#include <stdlib.h> /* defines NULL */
+
+/*****************************************************************************/
+/* hashtable_iterator    - iterator constructor */
+
+/* Allocate an iterator positioned on the first entry of h. For an empty
+ * table the iterator has e == NULL and index == tablelength. Returns NULL
+ * if the iterator cannot be allocated. */
+struct hashtable_itr *
+hashtable_iterator(struct hashtable *h)
+{
+    struct hashtable_itr *it;
+    unsigned int nbuckets, b;
+
+    it = malloc(sizeof(*it));
+    if (it == NULL)
+        return NULL;
+
+    nbuckets = h->tablelength;
+    it->h = h;
+    it->e = NULL;
+    it->parent = NULL;
+    it->index = nbuckets;
+
+    if (h->entrycount != 0)
+    {
+        /* settle on the first non-empty bucket */
+        for (b = 0; b < nbuckets; b++)
+        {
+            if (h->table[b] == NULL)
+                continue;
+            it->e = h->table[b];
+            it->index = b;
+            break;
+        }
+    }
+    return it;
+}
+
+/*****************************************************************************/
+/* key      - return the key of the (key,value) pair at the current position */
+/* value    - return the value of the (key,value) pair at the current position 
*/
+
+/* Return the key of the entry the iterator currently points at. */
+void *
+hashtable_iterator_key(struct hashtable_itr *i)
+{
+    struct entry *current = i->e;
+
+    return current->k;
+}
+
+/* Return the value of the entry the iterator currently points at. */
+void *
+hashtable_iterator_value(struct hashtable_itr *i)
+{
+    struct entry *current = i->e;
+
+    return current->v;
+}
+
+/*****************************************************************************/
+/* advance - advance the iterator to the next element
+ *           returns zero if advanced to end of table */
+
+/* Move the iterator to the next entry: first along the current bucket's
+ * chain, then on to the next non-empty bucket. Returns non-zero (-1) if it
+ * now points at an entry, 0 if the end of the table was reached (or the
+ * iterator was already exhausted). */
+int
+hashtable_iterator_advance(struct hashtable_itr *itr)
+{
+    struct entry **buckets;
+    unsigned int pos, nbuckets;
+
+    if (itr->e == NULL)
+        return 0; /* stupidity check */
+
+    /* more entries in the same chain? */
+    if (itr->e->next != NULL)
+    {
+        itr->parent = itr->e;
+        itr->e = itr->e->next;
+        return -1;
+    }
+
+    /* chain exhausted: continue with the following buckets */
+    nbuckets = itr->h->tablelength;
+    itr->parent = NULL;
+    pos = ++(itr->index);
+    if (pos >= nbuckets)
+    {
+        itr->e = NULL;
+        return 0;
+    }
+
+    buckets = itr->h->table;
+    for (; pos < nbuckets; pos++)
+    {
+        if (buckets[pos] != NULL)
+        {
+            itr->index = pos;
+            itr->e = buckets[pos];
+            return -1;
+        }
+    }
+
+    itr->index = nbuckets;
+    itr->e = NULL;
+    return 0;
+}
+
+/*****************************************************************************/
+/* remove - remove the entry at the current iterator position
+ *          and advance the iterator, if there is a successive
+ *          element.
+ *          If you want the value, read it before you remove:
+ *          beware memory leaks if you don't.
+ *          Returns zero if end of iteration. */
+
+/* Unlinks the current entry from its bucket chain, releases its key, moves
+ * the iterator on and frees the entry. The entry's value is not touched.
+ * Returns the result of hashtable_iterator_advance. */
+int
+hashtable_iterator_remove(struct hashtable_itr *itr)
+{
+    struct entry *remember_e, *remember_parent;
+    int ret;
+
+    /* Do the removal */
+    if (NULL == (itr->parent))
+    {
+        /* element is head of a chain */
+        itr->h->table[itr->index] = itr->e->next;
+    } else {
+        /* element is mid-chain */
+        itr->parent->next = itr->e->next;
+    }
+    /* itr->e is now outside the hashtable */
+    remember_e = itr->e;
+    itr->h->entrycount--;
+    /* only the key is released here; the value is left alone */
+    freekey(remember_e->k);
+
+    /* Advance the iterator, correcting the parent */
+    remember_parent = itr->parent;
+    ret = hashtable_iterator_advance(itr);
+    /* advance records the entry it stepped off as parent; that entry is the
+     * one being removed, so restore its predecessor instead */
+    if (itr->parent == remember_e) { itr->parent = remember_parent; }
+    /* freed only now because advance still reads remember_e->next */
+    free(remember_e);
+    return ret;
+}
+
+/*****************************************************************************/
+/* Position itr on the entry of h whose key matches k. Returns non-zero (-1)
+ * and fills in itr on a match; returns 0 and leaves itr untouched if the
+ * key is not present. */
+int /* returns zero if not found */
+hashtable_iterator_search(struct hashtable_itr *itr,
+                          struct hashtable *h, void *k)
+{
+    unsigned int keyhash = hash(h,k);
+    unsigned int bucket = indexFor(h->tablelength,keyhash);
+    struct entry *prev = NULL;
+    struct entry *cur;
+
+    for (cur = h->table[bucket]; cur != NULL; prev = cur, cur = cur->next)
+    {
+        /* Check hash value to short circuit heavier comparison */
+        if (keyhash != cur->h)
+            continue;
+        if (!h->eqfn(k, cur->k))
+            continue;
+
+        itr->index = bucket;
+        itr->e = cur;
+        itr->parent = prev;
+        itr->h = h;
+        return -1;
+    }
+    return 0;
+}
+
+
+/*
+ * Copyright (c) 2002, 2004, Christopher Clark
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ * 
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap2/drivers/hashtable_itr.h 
b/tools/blktap2/drivers/hashtable_itr.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap2/drivers/hashtable_itr.h
@@ -0,0 +1,112 @@
+/* Copyright (C) 2002, 2004 Christopher Clark 
<firstname.lastname@xxxxxxxxxxxx> */
+
+#ifndef __HASHTABLE_ITR_CWC22__
+#define __HASHTABLE_ITR_CWC22__
+#include "hashtable.h"
+#include "hashtable_private.h" /* needed to enable inlining */
+
+/*****************************************************************************/
+/* This struct is only concrete here to allow the inlining of two of the
+ * accessor functions. */
+struct hashtable_itr
+{
+    struct hashtable *h;    /* table this iterator walks */
+    struct entry *e;        /* entry at the current position */
+    struct entry *parent;   /* predecessor of e in the ->next chain */
+    unsigned int index;     /* presumably e's bucket index in h->table */
+};
+
+
+/*****************************************************************************/
+/* hashtable_iterator
+ */
+
+struct hashtable_itr *
+hashtable_iterator(struct hashtable *h);
+
+/*****************************************************************************/
+/* hashtable_iterator_key
+ * - return the key of the (key,value) pair at the current position */
+
+extern inline void *
+hashtable_iterator_key(struct hashtable_itr *i)
+{
+    return i->e->k;    /* no NULL check: i->e must point at an entry */
+}
+
+/*****************************************************************************/
+/* value - return the value of the (key,value) pair at the current position */
+
+extern inline void *
+hashtable_iterator_value(struct hashtable_itr *i)
+{
+    return i->e->v;    /* no NULL check: i->e must point at an entry */
+}
+
+/*****************************************************************************/
+/* advance - advance the iterator to the next element
+ *           returns zero if advanced to end of table */
+
+int
+hashtable_iterator_advance(struct hashtable_itr *itr);
+
+/*****************************************************************************/
+/* remove - remove current element and advance the iterator to the next element
+ *          NB: if you need the value to free it, read it before
+ *          removing. ie: beware memory leaks!
+ *          returns zero if advanced to end of table */
+
+int
+hashtable_iterator_remove(struct hashtable_itr *itr);
+
+/*****************************************************************************/
+/* search - overwrite the supplied iterator, to point to the entry
+ *          matching the supplied key.
+ *          h points to the hashtable to be searched.
+ *          returns zero if not found. */
+int
+hashtable_iterator_search(struct hashtable_itr *itr,
+                          struct hashtable *h, void *k);
+
+#define DEFINE_HASHTABLE_ITERATOR_SEARCH(fnname, keytype) \
+int fnname (struct hashtable_itr *i, struct hashtable *h, keytype *k) \
+{ \
+    return (hashtable_iterator_search(i,h,k)); \
+} /* defines fnname: hashtable_iterator_search taking a keytype* key */
+
+
+
+#endif /* __HASHTABLE_ITR_CWC22__*/
+
+/*
+ * Copyright (c) 2002, 2004, Christopher Clark
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ * 
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap2/drivers/hashtable_utility.c 
b/tools/blktap2/drivers/hashtable_utility.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap2/drivers/hashtable_utility.c
@@ -0,0 +1,71 @@
+/* Copyright (C) 2002 Christopher Clark <firstname.lastname@xxxxxxxxxxxx> */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include "hashtable_utility.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/*****************************************************************************/
+/* hashtable_change
+ *
+ * function to change the value associated with a key, where there already
+ * exists a value bound to the key in the hashtable.
+ * Source due to Holger Schemel.
+ * The old value is free()d, so v must not be the pointer already stored.
+ * Returns -1 if the key was found and updated, 0 if no entry matched. */
+int
+hashtable_change(struct hashtable *h, void *k, void *v)
+{
+    struct entry *e;
+    unsigned int hashvalue, index;
+    hashvalue = hash(h,k);
+    index = indexFor(h->tablelength,hashvalue);
+    e = h->table[index];    /* head of the chain for this key's bucket */
+    while (NULL != e)
+    {
+        /* Check hash value to short circuit heavier comparison */
+        if ((hashvalue == e->h) && (h->eqfn(k, e->k)))
+        {
+            free(e->v);     /* release the previously stored value */
+            e->v = v;
+            return -1;
+        }
+        e = e->next;
+    }
+    return 0;               /* no matching key; table left unchanged */
+}
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ * 
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap2/drivers/hashtable_utility.h 
b/tools/blktap2/drivers/hashtable_utility.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap2/drivers/hashtable_utility.h
@@ -0,0 +1,55 @@
+/* Copyright (C) 2002 Christopher Clark <firstname.lastname@xxxxxxxxxxxx> */
+
+#ifndef __HASHTABLE_CWC22_UTILITY_H__
+#define __HASHTABLE_CWC22_UTILITY_H__
+
+/*****************************************************************************
+ * hashtable_change
+ *
+ * function to change the value associated with a key, where there already
+ * exists a value bound to the key in the hashtable.
+ * Source due to Holger Schemel.
+ *
+ * @name        hashtable_change
+ * @param   h   the hashtable
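+ * @param   k   the key whose value is to be replaced
+ * @param   v   the new value to bind to the key
+ * @return      -1 if the key was found and changed, 0 if it was not found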
+ */
+int
+hashtable_change(struct hashtable *h, void *k, void *v);
+
+#endif /* __HASHTABLE_CWC22_UTILITY_H__ */
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ * 
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/python/xen/xend/server/BlktapController.py 
b/tools/python/xen/xend/server/BlktapController.py
--- a/tools/python/xen/xend/server/BlktapController.py
+++ b/tools/python/xen/xend/server/BlktapController.py
@@ -28,6 +28,7 @@
     'ram',
     'qcow',
     'vhd',
+    'remus',
     ]
 
 blktap_disk_types = blktap1_disk_types + blktap2_disk_types

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
WARNING - OLD ARCHIVES

xen-devel

[Xen-devel] [PATCH 11 of 11] blktap2: add remus driver