WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] Manual merge.

# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID f8acd354e1295226fbda14aaf8bd164e07b93742
# Parent  80d5dd14711eccf379e475000f3b156df286d279

# Parent  09067ce923038c4ba6dcb9630fb848cce0d1c5fa
Manual merge.

diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/Makefile
--- a/tools/blktap/Makefile     Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/Makefile     Sun Jul  3 22:36:48 2005
@@ -2,43 +2,24 @@
 MINOR    = 0
 SONAME   = libblktap.so.$(MAJOR)
 
-CC       = gcc
-
 XEN_ROOT = ../..
 include $(XEN_ROOT)/tools/Rules.mk
 
-BLKTAP_INSTALL_DIR     = /usr/sbin
+SUBDIRS :=
+SUBDIRS += parallax
 
-INSTALL         = install
-INSTALL_PROG    = $(INSTALL) -m0755
-INSTALL_DIR     = $(INSTALL) -d -m0755
+BLKTAP_INSTALL_DIR = /usr/sbin
 
-INCLUDES += 
+INSTALL            = install
+INSTALL_PROG       = $(INSTALL) -m0755
+INSTALL_DIR        = $(INSTALL) -d -m0755
+
+INCLUDES += -I. -I $(XEN_LIBXC)
 
 LIBS     := -lpthread -lz
 
 SRCS     :=
 SRCS     += blktaplib.c
-
-PLX_SRCS := 
-PLX_SRCS += vdi.c 
-PLX_SRCS += radix.c 
-PLX_SRCS += snaplog.c
-PLX_SRCS += blockstore.c 
-PLX_SRCS += block-async.c
-PLX_SRCS += requests-async.c
-VDI_SRCS := $(PLX_SRCS)
-PLX_SRCS += parallax.c
-
-VDI_TOOLS :=
-VDI_TOOLS += vdi_create
-VDI_TOOLS += vdi_list
-VDI_TOOLS += vdi_snap
-VDI_TOOLS += vdi_snap_list
-VDI_TOOLS += vdi_snap_delete
-VDI_TOOLS += vdi_fill
-VDI_TOOLS += vdi_tree
-VDI_TOOLS += vdi_validate
 
 CFLAGS   += -Wall
 CFLAGS   += -Werror
@@ -46,20 +27,21 @@
 #CFLAGS   += -O3
 CFLAGS   += -g3
 CFLAGS   += -fno-strict-aliasing
-CFLAGS   += -I $(XEN_LIBXC)
-CFLAGS   += $(INCLUDES) -I.
 CFLAGS   += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
 # Get gcc to generate the dependencies for us.
 CFLAGS   += -Wp,-MD,.$(@F).d
+CFLAGS   += $(INCLUDES) 
 DEPS     = .*.d
 
 OBJS     = $(patsubst %.c,%.o,$(SRCS))
-IBINS    = blkdump parallax $(VDI_TOOLS)
+IBINS    = blkdump
 
 LIB      = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR)
 
-all: mk-symlinks blkdump $(VDI_TOOLS) parallax blockstored
-       $(MAKE) $(LIB)
+all: mk-symlinks libblktap.so blkdump
+       @set -e; for subdir in $(SUBDIRS); do \
+               $(MAKE) -C $$subdir $@;       \
+       done
 
 LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse)
 mk-symlinks:
@@ -77,10 +59,16 @@
        $(INSTALL_DIR) -p $(DESTDIR)/usr/include
        $(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/$(LIBDIR)
        $(INSTALL_PROG) blktaplib.h $(DESTDIR)/usr/include
-       $(INSTALL_PROG) $(IBINS) $(DESTDIR)/$(BLKTAP_INSTALL_DIR)
+       $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(BLKTAP_INSTALL_DIR)
+       @set -e; for subdir in $(SUBDIRS); do \
+               $(MAKE) -C $$subdir $@;       \
+       done
 
 clean:
-       rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump 
$(VDI_TOOLS) parallax vdi_unittest
+       rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump
+       @set -e; for subdir in $(SUBDIRS); do \
+               $(MAKE) -C $$subdir $@;       \
+       done
 
 rpm: all
        rm -rf staging
@@ -91,52 +79,17 @@
        mv staging/i386/*.rpm .
        rm -rf staging
 
-libblktap.so:
+libblktap.so: $(OBJS)
+       $(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared -o      \
+             libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
+       ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR)
        ln -sf libblktap.so.$(MAJOR) $@
-libblktap.so.$(MAJOR):
-       ln -sf libblktap.so.$(MAJOR).$(MINOR) $@
-libblktap.so.$(MAJOR).$(MINOR): $(OBJS)
-       $(CC) -Wl,-soname -Wl,$(SONAME) -shared -o $@ $^ $(LIBS)
 
-blkdump: $(LIB)
+blkdump: libblktap.so
        $(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L. -l blktap blkdump.c
 
-parallax: $(LIB) $(PLX_SRCS)
-       $(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L. -lblktap $(LIBS) 
$(PLX_SRCS) 
+.PHONY: TAGS clean install mk-symlinks rpm
 
-vdi_list: $(LIB) vdi_list.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c $(LIBS) $(VDI_SRCS)
-
-vdi_create: $(LIB) vdi_create.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_create vdi_create.c $(LIBS) $(VDI_SRCS)
-
-vdi_snap: $(LIB) vdi_snap.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_snap vdi_snap.c $(LIBS) $(VDI_SRCS)
-
-vdi_snap_list: $(LIB) vdi_snap_list.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c $(LIBS) $(VDI_SRCS)
-
-vdi_snap_delete: $(LIB) vdi_snap_delete.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_snap_delete vdi_snap_delete.c $(LIBS) 
$(VDI_SRCS)
-
-vdi_tree: $(LIB) vdi_tree.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c $(LIBS) $(VDI_SRCS)
-
-vdi_fill: $(LIB) vdi_fill.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_fill vdi_fill.c $(LIBS) $(VDI_SRCS)
-
-vdi_validate: $(LIB) vdi_validate.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c $(LIBS) $(VDI_SRCS)
-
-vdi_unittest: $(LIB) vdi_unittest.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_unittest vdi_unittest.c $(LIBS) $(VDI_SRCS)
-
-blockstored: blockstored.c
-       $(CC) $(CFLAGS) -g3 -o blockstored $(LIBS) blockstored.c
-bstest: bstest.c blockstore.c
-       $(CC) $(CFLAGS) -g3 -o bstest bstest.c $(LIBS) blockstore.c
-
-.PHONY: TAGS clean install mk-symlinks rpm
 TAGS:
        etags -t $(SRCS) *.h
 
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_tree.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_tree.c  Sun Jul  3 22:36:48 2005
@@ -0,0 +1,132 @@
+/**************************************************************************
+ * 
+ * vdi_tree.c
+ *
+ * Output current vdi tree to dot and postscript.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+#define GRAPH_DOT_FILE "vdi.dot"
+#define GRAPH_PS_FILE  "vdi.ps"
+
+typedef struct sh_st {
+    snap_id_t     id;
+    struct sh_st *next;
+} sh_t;
+
+#define SNAP_HASHSZ 1024
+sh_t *node_hash[SNAP_HASHSZ];
+#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
+
+#define SNAPID_EQUAL(_a,_b) \
+    (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
+/*
+ * Record a snapshot id in node_hash, reporting whether it was already there.
+ *
+ * Returns 1 if an equal id (same block and index) is already in the table,
+ * 0 if the id was newly inserted.  Exits if memory for a new entry cannot
+ * be allocated.
+ */
+int sh_check_and_add(snap_id_t *id)
+{
+    sh_t **s = &node_hash[SNAP_HASH(id)];
+    
+    /* Walk the chain by advancing the link pointer itself.  Assigning
+     * through *s here would overwrite the bucket head and silently drop
+     * every entry that had been visited. */
+    while (*s != NULL) {
+        if (SNAPID_EQUAL(&((*s)->id), id))
+            return 1;
+        s = &(*s)->next;
+    }
+    
+    /* s now addresses the NULL link at the end of the chain. */
+    *s = (sh_t *)malloc(sizeof(sh_t));
+    if (*s == NULL) {
+        perror("sh_check_and_add: malloc");
+        exit(-1);
+    }
+    (*s)->id = *id;
+    (*s)->next = NULL;
+    
+    return 0;
+}
+
+/*
+ * Dump the VDI/snapshot fork tree as a Graphviz dot file (GRAPH_DOT_FILE)
+ * and run dot(1) to render it as postscript.
+ *
+ * argv[1], if given, overrides the default postscript output name
+ * (GRAPH_PS_FILE).  Returns 0 on success and 1 if dot could not be run
+ * successfully; exits with -1 if the VDI registry or the dot file cannot
+ * be opened.
+ */
+int main(int argc, char *argv[])
+{
+    FILE *f;
+    char dot_file[255] = GRAPH_DOT_FILE;
+    char  ps_file[255] = GRAPH_PS_FILE;
+    vdi_registry_t *reg;
+    vdi_t *vdi;
+    int i;
+    
+    __init_blockstore();
+    __init_vdi();
+    
+    reg = get_vdi_registry();
+    
+    if ( reg == NULL ) {
+        printf("couldn't get VDI registry.\n");
+        exit(-1);
+    }
+    
+    if ( argc > 1 ) {
+        /* strncpy does not terminate on truncation, so terminate at the
+         * last valid index (sizeof - 1), never at sizeof. */
+        strncpy(ps_file, argv[1], sizeof(ps_file) - 1);
+        ps_file[sizeof(ps_file) - 1] = '\0';
+    }
+    
+    /* now dump it out to a dot file. */
+    printf("[o] Dumping state to a dot graph. (%d VDIs)\n",
+           (int)reg->nr_vdis);
+    
+    f = fopen(dot_file, "w");
+    if ( f == NULL ) {
+        perror(dot_file);
+        exit(-1);
+    }
+    
+    /* write graph preamble */
+    fprintf(f, "digraph G {\n");
+    fprintf(f, "   rankdir=LR\n");
+    
+    for (i=0; i<reg->nr_vdis; i++) {
+        char oldnode[255];
+        snap_block_t *blk;
+        snap_id_t id;
+        int nr_snaps, done=0;
+        
+        vdi = vdi_get(i);
+        if ( vdi == NULL ) {
+            printf("couldn't get VDI %d, skipping.\n", i);
+            continue;
+        }
+        id = vdi->snap;
+        
+        /* Add a node for the id.  Block and index are separated by '_' so
+         * that e.g. (1,23) and (12,3) do not collapse into one node name. */
+        printf("vdi: %d\n", i);
+        fprintf(f, "   n%Ld_%d "
+                "[color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n",
+                id.block, id.index, vdi->name,
+                id.block, id.index);
+        sprintf(oldnode, "n%Ld_%d", id.block, id.index);
+        
+        /* Follow fork points back towards the root (block 0). */
+        while (id.block != 0) {
+            blk = snap_get_block(id.block);
+            if ( blk == NULL ) {
+                printf("couldn't read snap block %Ld.\n", id.block);
+                break;
+            }
+            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
+            id = blk->hdr.fork_block;
+            
+            /* done != 0 means this fork point was already drawn, so the
+             * rest of the chain is already in the graph. */
+            done = sh_check_and_add(&id);
+            
+            /* add a node for the fork_id */
+            if (!done) {
+                fprintf(f, "   n%Ld_%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n",
+                    id.block, id.index,
+                    id.block, id.index);
+            }
+            
+            /* add an edge between them */
+            fprintf(f, "   n%Ld_%d -> %s [label=\"%u snapshots\"]\n",
+                    id.block, id.index, oldnode, nr_snaps);
+            sprintf(oldnode, "n%Ld_%d", id.block, id.index);
+            freeblock(blk);
+            
+            if (done) break;
+        }
+    }
+    
+    /* write graph postamble */
+    fprintf(f, "}\n");
+    fclose(f);
+    
+    printf("[o] Generating postscript graph. (%s)\n", ps_file);
+    {
+        /* Sized to hold both file names plus the fixed command text. */
+        char cmd[sizeof(dot_file) + sizeof(ps_file) + 32];
+        
+        /* NOTE(review): ps_file comes from argv[1] and is passed to the
+         * shell unquoted; do not run this on untrusted arguments. */
+        snprintf(cmd, sizeof(cmd), "dot %s -Tps -o %s", dot_file, ps_file);
+        if ( system(cmd) != 0 ) {
+            printf("dot failed; is Graphviz installed?\n");
+            return 1;
+        }
+    }
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/snaplog.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/snaplog.c   Sun Jul  3 22:36:48 2005
@@ -0,0 +1,238 @@
+/**************************************************************************
+ * 
+ * snaplog.c
+ *
+ * Snapshot log on-disk data structure.
+ *
+ */
+ 
+ /* VDI histories are made from chains of snapshot logs.  These logs record 
+  * the (radix) root and timestamp of individual snapshots.
+  *
+  * creation of a new VDI involves 'forking' a snapshot log, by creating a 
+  * new, empty log (in a new VDI) and parenting it off of a record in an 
+  * existing snapshot log.
+  *
+  * snapshot log blocks have at most one writer.
+  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "snaplog.h"
+
+
+
+/*
+ * Read a snapshot log block and validate its magic number.
+ *
+ * Returns the block on success, or NULL if it could not be read or does
+ * not carry SNAP_MAGIC.  Callers in this file release a non-NULL result
+ * with freeblock().
+ */
+snap_block_t *snap_get_block(u64 block)
+{
+    snap_block_t *sb = (snap_block_t *)readblock(block);
+    
+    if ( (sb != NULL) && (sb->hdr.magic != SNAP_MAGIC) ) {
+        /* Readable, but not a snapshot log block: discard it. */
+        freeblock(sb);
+        sb = NULL;
+    }
+    
+    return sb;
+}
+    
+/*
+ * Copy the snapshot record named by id into *target.
+ *
+ * Returns 0 on success; -1 if id is NULL, the block cannot be read as a
+ * snap block, or id->index is greater than the block's nr_entries.
+ *
+ * NOTE(review): index == nr_entries is accepted.  snap_append() hands out
+ * ids whose index equals nr_entries, so this looks intentional, but that
+ * slot is one past the last written record -- confirm.
+ */
+int snap_get_id(snap_id_t *id, snap_rec_t *target)
+{
+    snap_block_t *sb;
+    int rc = -1;
+    
+    if ( id == NULL )
+        return rc;
+    
+    sb = snap_get_block(id->block);
+    if ( sb == NULL )
+        return rc;
+    
+    if ( id->index <= sb->hdr.nr_entries ) {
+        *target = sb->snaps[id->index];
+        rc = 0;
+    }
+    
+    freeblock(sb);
+    return rc;
+}
+
+/*
+ * Allocate a new, empty snapshot log block.
+ *
+ * parent_id: record the new block is parented on, or NULL for a root log.
+ * fork_id:   fork point to store in the header; must be non-NULL whenever
+ *            parent_id is non-NULL.
+ * new_id:    out; set to { new block address, index 0 }.
+ *
+ * When the parent lives in a different, non-zero block than the fork
+ * point, the running log_entries count is carried over from the parent
+ * block.
+ *
+ * Returns 0 on success, -1 on failure.  The ids are not validated against
+ * the store (snap_get_id() is not consulted).
+ */
+int __snap_block_create(snap_id_t *parent_id, snap_id_t *fork_id,
+                                  snap_id_t *new_id)
+{
+    snap_block_t *blk, *pblk;
+    
+    /* A parent without a fork point cannot be recorded. */
+    if ( (parent_id != NULL) && (fork_id == NULL) )
+        return -1;
+    
+    blk = (snap_block_t *)newblock();
+    if ( blk == NULL )
+        return -1;
+    
+    blk->hdr.magic  = SNAP_MAGIC;
+    blk->hdr.nr_entries  = 0;
+    blk->hdr.log_entries = 0;
+    blk->hdr.immutable   = 0;
+    
+    if (   (parent_id  != NULL) 
+        && (parent_id->block != fork_id->block) 
+        && (parent_id->block != 0)) {
+        
+        pblk = snap_get_block(parent_id->block);
+        if ( pblk == NULL ) {
+            freeblock(blk);
+            return -1;
+        }
+        blk->hdr.log_entries = pblk->hdr.log_entries;
+        freeblock(pblk);
+    }
+    
+    if (parent_id != NULL) {
+        blk->hdr.parent_block = *parent_id;
+        blk->hdr.fork_block   = *fork_id;
+    } else {
+        blk->hdr.parent_block = null_snap_id;
+        blk->hdr.fork_block   = null_snap_id;
+    }
+    
+    new_id->index = 0;
+    new_id->block = allocblock(blk);
+    freeblock(blk);
+    /* allocblock() reports failure as block address 0. */
+    if (new_id->block == 0)
+        return -1;
+    
+    return 0;
+}
+
+/*
+ * Create a new, empty snapshot log block parented on parent_id.
+ *
+ * Thin wrapper over __snap_block_create() that passes parent_id as both
+ * the parent and the fork point.  Returns that function's result
+ * (0 on success, -1 on failure); on success *new_id names the new block.
+ */
+int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id)
+{
+    return __snap_block_create(parent_id, parent_id, new_id);
+}
+
+/*
+ * Append snapshot record rec to the log whose current head is old_id.
+ *
+ * If the head block is full, it is marked immutable and a new block is
+ * chained onto it before the record is written.  On success *new_id is
+ * set to the new head (block, and index one past the appended record).
+ *
+ * Returns 0 on success, -1 if rec is marked deleted, the head block cannot
+ * be read, the head block is immutable, a new block cannot be created, or
+ * a block write fails.  No block buffer is leaked on any path.
+ */
+int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id)
+{
+    snap_id_t id = *old_id;
+    snap_block_t *blk;
+    int ret;
+    
+    /* Check the record before touching the store. */
+    if ( rec->deleted == 1 ) {
+        printf("Attempt to append a deleted snapshot!\n");
+        return -1;
+    }
+    
+    blk = snap_get_block(id.block);
+    if ( blk == NULL ) {
+        printf("Couldn't read snap block %Ld!\n", id.block);
+        return -1;
+    }
+    
+    if ( blk->hdr.immutable != 0 ) {
+        printf("Attempt to snap an immutable snap block!\n");
+        freeblock(blk);
+        return -1;
+    }
+    
+    new_id->block = id.block;
+    
+    if (blk->hdr.nr_entries == SNAPS_PER_BLOCK) {
+        id.index--; /* make id point to the last full record */
+        
+        ret = __snap_block_create(&id, &blk->hdr.fork_block, new_id);
+        if ( ret != 0 ) {
+            freeblock(blk);
+            return -1;
+        }
+        
+        /* Seal the full block, then continue in the freshly made one. */
+        blk->hdr.immutable = 1;
+        ret = writeblock(id.block, blk);
+        freeblock(blk);
+        if ( ret != 0 )
+            return -1;
+        
+        blk = snap_get_block(new_id->block);
+        if ( blk == NULL )
+            return -1;
+        id = *new_id;
+    }
+    
+    blk->snaps[blk->hdr.nr_entries] = *rec;
+    blk->hdr.nr_entries++;
+    blk->hdr.log_entries++;
+    new_id->index = blk->hdr.nr_entries;
+    
+    /* writeblock() returns 0 on success. */
+    ret = (writeblock(id.block, blk) == 0) ? 0 : -1;
+    freeblock(blk);
+    return ret;
+}
+
+/*
+ * Delete snapshot p_id by collapsing it into its successor c_id.
+ *
+ * height: radix tree height, passed through to collapse().
+ * p_id:   the snapshot to delete (parent).
+ * c_id:   the snapshot that follows it (child).
+ *
+ * Only consecutive records in the same log block are supported for now.
+ * Returns the result of collapse() (number of blocks reclaimed), or -1 if
+ * either block cannot be read or the ids are not consecutive.
+ */
+int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id)
+{
+    snap_block_t *p_blk, *c_blk;
+    snap_rec_t   *p_rec, *c_rec;
+    int ret = -1;
+    
+    p_blk = snap_get_block(p_id->block);
+    
+    if (p_blk == NULL) return(-1);
+    
+    if (c_id->block == p_id->block)
+    {
+        c_blk = p_blk;
+    } else {
+         c_blk = snap_get_block(c_id->block);
+    }
+    
+    /* It is the child block that may have failed to load here; p_blk is
+     * already known to be valid and must be released. */
+    if (c_blk == NULL) {
+        freeblock(p_blk);
+        return(-1);
+    }
+     
+    p_rec = &p_blk->snaps[p_id->index];
+    c_rec = &c_blk->snaps[c_id->index];
+    
+    /* XXX todo: parent and child must not already be deleted, and the
+     * first non-deleted record before the child must be the parent.
+     * Neither is checked yet. */
+    
+    /* XXX todo: test the range here for delete (and eventually fork) bits */
+    /* for now, snaps must be consecutive, on the same log page: */
+    
+    if ((p_id->block != c_id->block) || (p_id->index != c_id->index-1))
+    {
+        printf("Deleting non-consecutive snaps is not done yet.\n");
+        goto done;
+    }
+    
+    /* mark parent as deleted XXX: may need to lock parent block here.*/
+    p_rec->deleted = 1;
+    writeblock(p_id->block, p_blk);
+    
+    /* delete the parent */
+    printf("collapse(%Ld, %Ld)\n", p_rec->radix_root, c_rec->radix_root);
+    ret = collapse(height, p_rec->radix_root, c_rec->radix_root);
+    
+    /* return the number of blocks reclaimed. */
+    
+done:
+    if (c_blk != p_blk) freeblock(c_blk);
+    freeblock(p_blk);
+    
+    return(ret);
+}
+
+/*
+ * Print the snapshot history reachable from snap_id, newest first.
+ *
+ * Starting at snap_id, prints every record from id.index down to 0 in the
+ * current block, then follows hdr.parent_block to the previous block in
+ * the chain until a parent with block address 0 is reached or a block
+ * cannot be read.
+ *
+ * NOTE(review): the starting index may equal nr_entries (see snap_append),
+ * in which case the first line printed is a not-yet-written slot --
+ * confirm whether that is intended.
+ */
+void snap_print_history(snap_id_t *snap_id)
+{
+    snap_id_t id = *snap_id;
+    unsigned int idx;
+    snap_block_t *next_blk, *blk = snap_get_block(id.block);
+    
+    while ( blk ) {
+        printf("[Snap block %Ld]:\n", id.block);
+        
+        /* Restart from this block's own index; the countdown below leaves
+         * idx wrapped around, so it must not carry over between blocks. */
+        idx = id.index;
+        do {
+            printf("   %03u: root: %Ld ts: %ld.%06ld\n", idx, 
+                    blk->snaps[idx].radix_root,
+                    (long)blk->snaps[idx].timestamp.tv_sec,
+                    (long)blk->snaps[idx].timestamp.tv_usec);
+        } while (idx-- != 0);
+        
+        /* Stop cleanly at the root instead of reusing a stale pointer. */
+        id = blk->hdr.parent_block;
+        next_blk = (id.block != 0) ? snap_get_block(id.block) : NULL;
+        
+        freeblock(blk);
+        blk = next_blk;
+    }
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/snaplog.h
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/snaplog.h   Sun Jul  3 22:36:48 2005
@@ -0,0 +1,61 @@
+/**************************************************************************
+ * 
+ * snaplog.h
+ *
+ * Snapshot log on-disk data structure.
+ *
+ */
+ 
+#include "radix.h"
+#include "blockstore.h"    /* for BLOCK_SIZE */
+ 
+#ifndef __SNAPLOG_H__
+#define __SNAPLOG_H__
+
+/* Names one snapshot record: the log block holding it and its slot. */
+typedef struct snap_id {
+    u64            block;   /* address of the snap log block (0 = none) */
+    unsigned int   index;   /* index into that block's snaps[] array */
+} snap_id_t;
+
+/* One snapshot: the radix root and timestamp recorded for it. */
+typedef struct snap_rec {
+    u64            radix_root;  /* radix tree root handed to collapse() */
+    struct timeval timestamp;   /* printed by snap_print_history() */
+    /* flags: */
+    unsigned       deleted:1;   /* set by snap_collapse() on the parent */
+} snap_rec_t;
+
+
+int  snap_block_create(snap_id_t *parent_id, snap_id_t *new_id);
+int  snap_append(snap_id_t *id, snap_rec_t *rec, snap_id_t *new_id);
+int  snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id);
+void snap_print_history(snap_id_t *snap_id);
+int  snap_get_id(snap_id_t *id, snap_rec_t *target);
+
+
+/* exported for vdi debugging */
+#define SNAP_MAGIC 0xff00ff0aa0ff00ffLL
+
+static const snap_id_t null_snap_id = { 0, 0 }; 
+
+/*
+ * Header at the start of every snapshot log block (see snap_block below).
+ * snap_get_block() rejects any block whose magic is not SNAP_MAGIC.
+ */
+typedef struct snap_block_hdr {
+    u64            magic;
+    snap_id_t      parent_block; /* parent block within this chain */
+    snap_id_t      fork_block;   /* where this log was forked */
+    unsigned       log_entries;  /* total entries since forking */
+    unsigned short nr_entries;   /* entries in snaps[] */
+    unsigned short immutable;    /* has this snap page become immutable? */
+} snap_block_hdr_t;
+
+
+#define SNAPS_PER_BLOCK \
+    ((BLOCK_SIZE - sizeof(snap_block_hdr_t)) / sizeof(snap_rec_t))
+
+/*
+ * One snapshot log block: the header followed by as many records as
+ * SNAPS_PER_BLOCK allows within BLOCK_SIZE.
+ */
+typedef struct snap_block {
+    snap_block_hdr_t hdr;
+    snap_rec_t       snaps[SNAPS_PER_BLOCK];
+} snap_block_t;
+    
+
+snap_block_t *snap_get_block(u64 block);
+
+#endif /* __SNAPLOG_H__ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/README
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/README      Sun Jul  3 22:36:48 2005
@@ -0,0 +1,177 @@
+Parallax Quick Overview
+March 3, 2005
+
+This is intended to provide a quick set of instructions to let you
+guys play with the current parallax source.  In its current form, the
+code will let you run an arbitrary number of VMs off of a single disk
+image, doing copy-on-write as they make updates.  Each domain is
+assigned a virtual disk image (VDI), which may be based on a snapshot
+of an existing image.  All of the VDI and snapshot management should
+currently work.
+
+The current implementation uses a single file as a blockstore for
+_everything_; this will soon be replaced by the fancier backend code
+and the local cache.  As it stands, Parallax will create
+"blockstore.dat" in the directory that you run it from, and use
+largefile support to make this grow to unfathomable girth.  So, you
+probably want to run the daemon off of a local disk, with a lot of
+free space.
+
+Here's how to get going:
+
+0. Setup:
+---------
+
+Pick a local directory on a disk with lots of room.  You should be
+running from a privileged domain (e.g. dom0) with the blocktap
+configured in and block backend NOT.
+
+For convenience (for the moment) copy all of the vdi tools (vdi_*) and
+the parallax daemon from tools/blktap into this directory.
+
+1. Populate the blockstore:
+---------------------------
+
+First you need to put at least one image into the blockstore.  You
+will need a disk image, either as a file or local partition.  My
+general approach has been to
+
+(a) make a really big sparse file with 
+
+        dd if=/dev/zero of=./image bs=4K count=1 seek=[big value]
+
+(b) put a filesystem into it
+
+        mkfs.ext3 ./image
+
+(c) mount it using loopback
+
+        mkdir ./mnt
+        mount -o loop ./image ./mnt
+
+(d) cd into it and untar one of the image files from srg-roots.
+
+        cd mnt
+        tar ...
+
+NOTE: Beware if your system is FC3.  mkfs is not compatible with old
+versions of fedora, and so you don't have much choice but to install
+further fc3 images if you have used the fc3 version of mkfs.
+
+(e) unmount the image
+
+        cd ..
+        umount mnt
+
+(f) now, create a new VDI to hold the image 
+
+        ./vdi_create "My new FC3 VDI"
+
+(g) get the id of the new VDI.
+
+        ./vdi_list
+
+        |      0                     My new FC3 VDI
+
+(0 is the VDI id... create a few more if you want.)
+
+(h) hoover your image into the new VDI.
+
+        ./vdi_fill 0 ./image
+
+This will pull the entire image into the blockstore and set up a
+mapping tree for it for VDI 0.  Passing a device (i.e. /dev/sda3)
+should also work, but vdi_fill has NO notion of sparseness yet, so you
+are going to pump a block into the store for each block you read.
+
+vdi_fill will count up until it is done, and you should be ready to
+go.  If you want to be anal, you can use vdi_validate to test the VDI
+against the original image.
+
+2. Create some extra VDIs
+-------------------------
+
+VDIs are actually a list of snapshots, and each snapshot is a full
+image of mappings.  So, to preserve an immutable copy of a current
+VDI, do this:
+
+(a) Snapshot your new VDI.
+
+        ./vdi_snap 0
+
+Snapshotting writes the current radix root to the VDI's snapshot log,
+and assigns it a new writable root.
+
+(b) look at the VDI's snapshot log.
+
+        ./vdi_snap_list 0
+
+        | 16   0      Thu Mar  3 19:27:48 2005 565111           31
+
+The first two columns constitute a snapshot id and represent the
+(block, offset) of the snapshot record.  The Date tells you when the
+snapshot was made, and 31 is the radix root node of the snapshot.
+
+(c) Create a new VDI, based on that snapshot, and look at the list.
+
+        ./vdi_create "FC3 - Copy 1" 16 0
+        ./vdi_list
+
+        |      0                     My new FC3 VDI
+        |      1                       FC3 - Copy 1
+
+NOTE: If you have Graphviz installed on your system, you can use
+vdi_tree to generate a postscript of your current set of VDIs and
+snapshots.
+
+
+Create as many VDIs as you need for the VMs that you want to run.
+
+3. Boot some VMs:
+-----------------
+
+Parallax currently uses a hack in xend to pass the VDI id, you need to
+modify the disk line of the VM config that is going to mount it.
+
+(a) set up your vm config, by using the following disk line:
+
+        disk = ['parallax:1,sda1,w,0' ]
+
+This example uses VDI 1 (from vdi_list above), presents it as sda1
+(writable), and uses dom 0 as the backend.  If you were running the
+daemon (and tap driver) in some domain other than 0, you would change
+this last parameter.
+
+NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so that it knows what to do with "parallax:".
+
+(b) Run parallax in the backend domain.
+
+        ./parallax
+
+(c) create your new domain.
+
+        xm create ...
+
+---
+
+That's pretty much all there is to it at the moment.  Hope this is
+clear enough to get you going.  Now, a few serious caveats that will
+be sorted out in the almost immediate future:
+
+WARNINGS:
+---------
+
+1. There is NO locking in the VDI tools at the moment, so I'd avoid
+running them in parallel, or more importantly, running them while the
+daemon is running.
+
+2. I doubt that xend will be very happy about restarting if you have
+parallax-using domains.  So if it dies while there are active parallax
+doms, you may need to reboot.
+
+3. I've turned off write-in-place.  So at the moment, EVERY block
+write is a log append on the blockstore.  I've been having some probs
+with the radix tree's marking of writable blocks after snapshots and
+will sort this out very soon.
+
+
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/bstest.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/bstest.c    Sun Jul  3 22:36:48 2005
@@ -0,0 +1,191 @@
+/**************************************************************************
+ * 
+ * bstest.c
+ *
+ * Block store daemon test program.
+ *
+ * usage: bstest <host>|X {r|w|a} ID 
+ *
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <errno.h>
+#include "blockstore.h"
+
+/*
+ * Send one request datagram straight to a blockstore daemon and print the
+ * reply.
+ *
+ * host: name or address of the daemon (IPv4 only).
+ * op:   operation code to place in the message.
+ * id:   block id to place in the message.
+ * len:  number of bytes of the message buffer to send.
+ *
+ * Returns 0 after a reply has been printed; exits with status 1 on any
+ * lookup or socket error.
+ */
+int direct(char *host, u32 op, u64 id, int len) {
+    struct sockaddr_in sn, peer;
+    int sock;
+    bsmsg_t msgbuf;
+    int rc;
+    socklen_t slen;
+    struct hostent *addr;
+
+    addr = gethostbyname(host);
+    if (!addr) {
+        /* gethostbyname() reports errors through h_errno, not errno. */
+        herror("bad hostname");
+        exit(1);
+    }
+    /* Only a 4-byte IPv4 address fits the sockaddr_in filled in below. */
+    if (addr->h_addrtype != AF_INET) {
+        fprintf(stderr, "bad hostname: not an IPv4 address\n");
+        exit(1);
+    }
+    memset(&peer, 0, sizeof(peer));
+    peer.sin_family = addr->h_addrtype;
+    peer.sin_port = htons(BLOCKSTORED_PORT);
+    peer.sin_addr.s_addr =  ((struct in_addr *)(addr->h_addr))->s_addr;
+    fprintf(stderr, "Sending to: %u.%u.%u.%u\n",
+            (unsigned int)(unsigned char)addr->h_addr[0],
+            (unsigned int)(unsigned char)addr->h_addr[1],
+            (unsigned int)(unsigned char)addr->h_addr[2],
+            (unsigned int)(unsigned char)addr->h_addr[3]);
+
+    sock = socket(AF_INET, SOCK_DGRAM, 0);
+    if (sock < 0) {
+        perror("Bad socket");
+        exit(1);
+    }
+    memset(&sn, 0, sizeof(sn));
+    sn.sin_family = AF_INET;
+    sn.sin_port = htons(BLOCKSTORED_PORT);
+    sn.sin_addr.s_addr = htonl(INADDR_ANY);
+    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
+        perror("bind");
+        close(sock);
+        exit(1);
+    }
+
+    memset((void *)&msgbuf, 0, sizeof(msgbuf));
+    msgbuf.operation = op;
+    msgbuf.id = id;
+
+    rc = sendto(sock, (void *)&msgbuf, len, 0,
+                (struct sockaddr *)&peer, sizeof(peer));
+    if (rc < 0) {
+        perror("sendto");
+        close(sock);
+        exit(1);
+    }
+
+    slen = sizeof(peer);
+    len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0,
+                   (struct sockaddr *)&peer, &slen);
+    if (len < 0) {
+        perror("recvfrom");
+        close(sock);
+        exit(1);
+    }
+
+    /* Print only the fields that the reply was long enough to carry. */
+    printf("Reply %d bytes:\n", len);
+    if (len >= MSGBUFSIZE_OP)
+        printf("  operation: %u\n", msgbuf.operation);
+    if (len >= MSGBUFSIZE_FLAGS)
+        printf("  flags: 0x%x\n", msgbuf.flags);
+    if (len >= MSGBUFSIZE_ID)
+        printf("  id: %llu\n", msgbuf.id);
+    if (len >= (MSGBUFSIZE_ID + 4))
+        /* Go through unsigned char so bytes >= 0x80 do not sign-extend. */
+        printf("  data: %02x %02x %02x %02x...\n",
+               (unsigned int)(unsigned char)msgbuf.block[0],
+               (unsigned int)(unsigned char)msgbuf.block[1],
+               (unsigned int)(unsigned char)msgbuf.block[2],
+               (unsigned int)(unsigned char)msgbuf.block[3]);
+    
+    close(sock);
+   
+    return 0;
+}
+
+/*
+ * Block store test driver.
+ *
+ * usage: bstest <host>|X {r|w|a} ID
+ *
+ * With host "X" the operation is run through the local blockstore library
+ * (readblock / writeblock / allocblock_hint); otherwise a request is sent
+ * to the daemon on <host> via direct().  ID defaults to 0.
+ *
+ * Returns 0 on success, 1 on a usage error or if the operation failed.
+ */
+int main (int argc, char **argv) {
+
+    u32 op = 0;
+    u64 id = 0;
+    int len = 0, rc;
+    int status = 0;
+    void *block;
+
+    if (argc < 3) {
+        fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n");
+        return 1;
+    }
+
+    switch (argv[2][0]) {
+    case 'r':
+    case 'R':
+        op = BSOP_READBLOCK;
+        len = MSGBUFSIZE_ID;
+        break;
+    case 'w':
+    case 'W':
+        op = BSOP_WRITEBLOCK;
+        len = MSGBUFSIZE_BLOCK;
+        break;
+    case 'a':
+    case 'A':
+        op = BSOP_ALLOCBLOCK;
+        len = MSGBUFSIZE_BLOCK;
+        break;
+    default:
+        fprintf(stderr, "Unknown action '%s'.\n", argv[2]);
+        return 1;
+    }
+
+    if (argc >= 4)
+        id = atoll(argv[3]);
+
+    if (strcmp(argv[1], "X") == 0) {
+        rc = __init_blockstore();
+        if (rc < 0) {
+            fprintf(stderr, "blockstore init failed.\n");
+            return 1;
+        }
+        switch(op) {
+        case BSOP_READBLOCK:
+            block = readblock(id);
+            if (block) {
+                printf("data: %02x %02x %02x %02x...\n",
+                       (unsigned int)((unsigned char*)block)[0],
+                       (unsigned int)((unsigned char*)block)[1],
+                       (unsigned int)((unsigned char*)block)[2],
+                       (unsigned int)((unsigned char*)block)[3]);
+                freeblock(block);
+            }
+            else {
+                /* A failed read used to produce no output at all. */
+                fprintf(stderr, "read of block %llu failed.\n", id);
+                status = 1;
+            }
+            break;
+        case BSOP_WRITEBLOCK:
+            block = malloc(BLOCK_SIZE);
+            if (!block) {
+                perror("bstest malloc");
+                return 1;
+            }
+            memset(block, 0, BLOCK_SIZE);
+            rc = writeblock(id, block);
+            if (rc != 0) {
+                printf("error\n");
+                status = 1;
+            }
+            else {
+                printf("OK\n");
+            }
+            free(block);
+            break;
+        case BSOP_ALLOCBLOCK:
+            block = malloc(BLOCK_SIZE);
+            if (!block) {
+                perror("bstest malloc");
+                return 1;
+            }
+            memset(block, 0, BLOCK_SIZE);
+            /* The ID argument is passed as the allocation hint. */
+            id = allocblock_hint(block, id);
+            if (id == 0) {
+                printf("error\n");
+                status = 1;
+            }
+            else {
+                printf("ID: %llu\n", id);
+            }
+            free(block);
+            break;
+        }
+    }
+    else {
+        status = direct(argv[1], op, id, len);
+    }
+
+    return status;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_snap_delete.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_snap_delete.c   Sun Jul  3 22:36:48 2005
@@ -0,0 +1,48 @@
+/**************************************************************************
+ * 
+ * vdi_snap_delete.c
+ *
+ * Delete a snapshot.
+ *
+ * This is not finished:  right now it takes a snap n and calls 
+ * snap_collapse(n,n+1).
+ *
+ * TODO: support for non-consecutive, non-same-block snaps
+ *       Avoid forking probs.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "snaplog.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * Delete snapshot <snap block>/<snap idx> by collapsing it into the record
+ * that follows it in the same log block (index + 1).
+ *
+ * Returns 0 and prints the number of freed blocks on success; exits with
+ * -1 on a usage error or blockstore init failure, and returns 1 if the
+ * collapse fails.
+ */
+int main(int argc, char *argv[])
+{
+    snap_id_t    id, c_id;
+    int ret;
+    
+    /* Validate the command line before touching the blockstore. */
+    if ( argc != 3 ) {
+        printf("usage: %s <snap block> <snap idx>\n", argv[0]);
+        exit(-1);
+    }
+    
+    if ( __init_blockstore() < 0 ) {
+        printf("blockstore init failed.\n");
+        exit(-1);
+    }
+    __init_vdi();
+    
+    id.block   = (u64)          atoll(argv[1]);
+    id.index   = (unsigned int) atol (argv[2]);
+    
+    /* The child is the very next record in the same block. */
+    c_id = id;
+    c_id.index++;
+    
+    ret = snap_collapse(VDI_HEIGHT, &id, &c_id);
+    
+    /* snap_collapse() returns -1 on failure, not a block count. */
+    if ( ret < 0 ) {
+        printf("Couldn't delete snapshot.\n");
+        return 1;
+    }
+    
+    printf("Freed %d blocks.\n", ret);
+    
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/block-async.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/block-async.c       Sun Jul  3 22:36:48 2005
@@ -0,0 +1,393 @@
+/* block-async.c
+ * 
+ * Asynchronous block wrappers for parallax.
+ */
+ 
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include "block-async.h"
+#include "blockstore.h"
+#include "vdi.h"
+
+
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* We have a queue of outstanding I/O requests implemented as a 
+ * circular producer-consumer ring with free-running buffers.
+ * To allow reordering, this ring indirects to indexes in a
+ * ring of io_structs.
+ * 
+ * the block_* calls may either add an entry to this ring and return, 
+ * or satisfy the request immediately and call the callback directly.
+ * None of the io calls in parallax should be nested enough to worry 
+ * about stack problems with this approach.
+ */
+
+/* Arguments of an IO_READ request: the address handed to readblock(). */
+struct read_args {
+    u64 addr;
+};
+
+/* Arguments of an IO_WRITE request: both are handed to writeblock(). */
+struct write_args {
+    u64   addr;
+    char *block;
+};
+
+/* Arguments of an IO_ALLOC request: the data handed to allocblock(). */
+struct alloc_args {
+    char *block;
+};
+ 
+/* One queued request, executed later by an I/O thread.
+ * 'op' selects which member of 'u' is meaningful (the two wake operations
+ * read none of them); 'cb' is called with the result and 'param' once the
+ * request has been carried out.
+ */
+struct pending_io_req {
+    enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op;
+    union {
+        struct read_args  r;
+        struct write_args w;
+        struct alloc_args a;
+    } u;
+    io_cb_t cb;
+    void *param;
+};
+
+/* Initialise a radix_lock: set up its mutex and reset all 1024 rows to
+ * "no holders, no waiters, state ANY".
+ */
+void radix_lock_init(struct radix_lock *r)
+{
+    int row = 0;
+
+    pthread_mutex_init(&r->lock, NULL);
+
+    while (row < 1024) {
+        r->waiters[row] = NULL;
+        r->state[row]   = ANY;
+        r->lines[row]   = 0;
+        row++;
+    }
+}
+
+/* Maximum number of outstanding I/O requests issued asynchronously.
+ * Must be a power of 2, because PENDING_IO_MASK() wraps ring positions
+ * with a bit mask. */
+#define MAX_PENDING_IO 1024
+
+/* how many threads to concurrently issue I/O to the disk. */
+#define IO_POOL_SIZE   10
+
+/* Request slots, and the ring of slot indexes that indirects into them. */
+static struct pending_io_req pending_io_reqs[MAX_PENDING_IO];
+static int pending_io_list[MAX_PENDING_IO];
+/* Free-running ring counters: io_prod advances when a request is queued,
+ * io_cons when an I/O thread picks one up, and io_free when a slot is
+ * handed back to the ring. */
+static unsigned long io_prod = 0, io_cons = 0, io_free = 0;
+/* Wrap a free-running counter to a position in pending_io_list. */
+#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1))
+/* Index of a request slot, given a pointer into pending_io_reqs. */
+#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs)
+/* Request slot referred to by ring position _x. */
+#define PENDING_IO_ENT(_x) \
+       (&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]])
+/* True while fewer than MAX_PENDING_IO slots are in use. */
+#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod)
+/* True while at least one queued request has not been picked up. */
+#define CAN_CONSUME_PENDING_IO (io_cons != io_prod)
+/* pending_io_lock guards the counters and the ring; pending_io_cond is
+ * signalled when a request is queued. */
+static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t  pending_io_cond = PTHREAD_COND_INITIALIZER;
+
+/* Fill the indirection ring with the identity mapping, so that ring
+ * position n initially refers to request slot n.
+ */
+static void init_pending_io(void)
+{
+    int slot = MAX_PENDING_IO;
+
+    while (slot-- > 0)
+        pending_io_list[slot] = slot;
+}
+
+/* Queue an asynchronous read of the block at 'addr'.  'cb' is called with
+ * the result and 'param' by whichever I/O thread services the request.
+ * Asserts that a free ring slot is available.
+ */
+void block_read(u64 addr, io_cb_t cb, void *param)
+{
+    struct pending_io_req *slot;
+    unsigned long          pos;
+
+    pthread_mutex_lock(&pending_io_lock);
+    assert(CAN_PRODUCE_PENDING_IO);
+
+    /* Claim the next ring position and fill in the slot it refers to. */
+    pos  = io_prod++;
+    slot = PENDING_IO_ENT(pos);
+    DPRINTF("Produce (R) %lu (%p)\n", pos, slot);
+    slot->op       = IO_READ;
+    slot->cb       = cb;
+    slot->param    = param;
+    slot->u.r.addr = addr;
+
+    /* Wake one I/O thread, then release the ring. */
+    pthread_cond_signal(&pending_io_cond);
+    pthread_mutex_unlock(&pending_io_lock);
+}
+
+
+/* Queue an asynchronous write of 'block' to address 'addr'.  'cb' is called
+ * with the result and 'param' by whichever I/O thread services the request.
+ * Asserts that a free ring slot is available.
+ */
+void block_write(u64 addr, char *block, io_cb_t cb, void *param)
+{
+    struct pending_io_req *slot;
+    unsigned long          pos;
+
+    pthread_mutex_lock(&pending_io_lock);
+    assert(CAN_PRODUCE_PENDING_IO);
+
+    /* Claim the next ring position and fill in the slot it refers to. */
+    pos  = io_prod++;
+    slot = PENDING_IO_ENT(pos);
+    DPRINTF("Produce (W) %lu (%p)\n", pos, slot);
+    slot->op        = IO_WRITE;
+    slot->cb        = cb;
+    slot->param     = param;
+    slot->u.w.block = block;
+    slot->u.w.addr  = addr;
+
+    /* Wake one I/O thread, then release the ring. */
+    pthread_cond_signal(&pending_io_cond);
+    pthread_mutex_unlock(&pending_io_lock);
+}
+
+
+/* Queue an asynchronous allocation that stores 'block'.  'cb' is called with
+ * the result and 'param' by whichever I/O thread services the request.
+ * Asserts that a free ring slot is available.
+ */
+void block_alloc(char *block, io_cb_t cb, void *param)
+{
+    struct pending_io_req *slot;
+    unsigned long          pos;
+
+    pthread_mutex_lock(&pending_io_lock);
+    assert(CAN_PRODUCE_PENDING_IO);
+
+    /* Claim the next ring position and fill in the slot it refers to. */
+    pos  = io_prod++;
+    slot = PENDING_IO_ENT(pos);
+    slot->op        = IO_ALLOC;
+    slot->cb        = cb;
+    slot->param     = param;
+    slot->u.a.block = block;
+
+    /* Wake one I/O thread, then release the ring. */
+    pthread_cond_signal(&pending_io_cond);
+    pthread_mutex_unlock(&pending_io_lock);
+}
+
+/* Take a read lock on 'row'.  If the row can be read-locked right away,
+ * 'cb' is called directly (with an IO_INT_T result of 0) after the radix
+ * lock has been dropped.  Otherwise the request is appended to the row's
+ * waiter list and 'cb' is deferred until wake_waiters() queues it.
+ */
+void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
+{
+    struct io_ret       ret;
+    struct radix_wait  *waiter;
+    struct radix_wait **link;
+
+    pthread_mutex_lock(&r->lock);
+
+    if (( r->lines[row] < 0 ) || (r->state[row] == STOP)) {
+        /* Row is write-locked (lines < 0) or in the STOP state: defer. */
+        waiter = (struct radix_wait *) malloc (sizeof(struct radix_wait));
+        DPRINTF("RLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
+        waiter->next  = NULL;
+        waiter->cb    = cb;
+        waiter->param = param;
+        waiter->type  = RLOCK;
+
+        /* Walk to the end of the waiter list and link the new entry. */
+        for (link = &r->waiters[row]; *link != NULL; link = &(*link)->next)
+            ;
+        *link = waiter;
+
+        pthread_mutex_unlock(&r->lock);
+        return;
+    }
+
+    /* Lock granted immediately: count one more reader on the row. */
+    r->lines[row]++;
+    r->state[row] = READ;
+    DPRINTF("RLOCK  : %3d (row: %d)\n", r->lines[row], row);
+    pthread_mutex_unlock(&r->lock);
+
+    ret.type = IO_INT_T;
+    ret.u.i  = 0;
+    cb(ret, param);
+}
+
+
+/* Take the write lock on 'row'.  If the row is idle, 'cb' is called
+ * directly (with an IO_INT_T result of 0) after the radix lock has been
+ * dropped.  Otherwise the request is appended to the row's waiter list and
+ * 'cb' is deferred until wake_waiters() queues it.
+ */
+void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
+{
+    struct io_ret       ret;
+    struct radix_wait  *waiter;
+    struct radix_wait **link;
+
+    pthread_mutex_lock(&r->lock);
+
+    /* the lines[] check here is redundant -- just here for debugging now. */
+    if ((r->state[row] != ANY) || ( r->lines[row] != 0 )) {
+        /* Row is in use: defer the writer. */
+        waiter = (struct radix_wait *) malloc (sizeof(struct radix_wait));
+        DPRINTF("WLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
+        waiter->next  = NULL;
+        waiter->cb    = cb;
+        waiter->param = param;
+        waiter->type  = WLOCK;
+
+        /* Walk to the end of the waiter list and link the new entry. */
+        for (link = &r->waiters[row]; *link != NULL; link = &(*link)->next)
+            ;
+        *link = waiter;
+
+        pthread_mutex_unlock(&r->lock);
+        return;
+    }
+
+    /* Lock granted immediately: mark the row as write-locked. */
+    r->state[row] = STOP;
+    r->lines[row] = -1;
+    DPRINTF("WLOCK  : %3d (row: %d)\n", r->lines[row], row);
+    pthread_mutex_unlock(&r->lock);
+
+    ret.type = IO_INT_T;
+    ret.u.i  = 0;
+    cb(ret, param);
+}
+
+/* Hand a row that has just become free to its queued waiters.
+ *
+ * Called with the radix_lock locked and a lock count of zero for the row.
+ * If the first waiter is a writer, only that writer is woken and the row is
+ * write-locked on its behalf.  Otherwise every reader at the front of the
+ * list is woken and counted as a holder, stopping at the first writer.
+ * Waiters are not called directly: a wake request carrying their callback
+ * is queued on the pending I/O ring for an I/O thread to run.
+ */
+static void wake_waiters(struct radix_lock *r, int row)
+{
+    struct pending_io_req *req;
+    struct radix_wait *rw;
+    
+    /* Nothing to do if the row is still held or nobody is waiting. */
+    if (r->lines[row] != 0) return;
+    if (r->waiters[row] == NULL) return; 
+    
+    if (r->waiters[row]->type == WLOCK) {
+
+        rw = r->waiters[row];
+        /* pending_io_lock is taken while r->lock is already held. */
+        pthread_mutex_lock(&pending_io_lock);
+        assert(CAN_PRODUCE_PENDING_IO);
+        
+        req = PENDING_IO_ENT(io_prod++);
+        req->op    = IO_WWAKE;
+        req->cb    = rw->cb;
+        req->param = rw->param;
+        r->lines[row] = -1; /* write lock the row. */
+        r->state[row] = STOP;
+        r->waiters[row] = rw->next;
+        free(rw);
+        pthread_mutex_unlock(&pending_io_lock);
+    
+    } else /* RLOCK */ {
+
+        /* Wake the whole run of readers at the head of the list. */
+        while ((r->waiters[row] != NULL) && (r->waiters[row]->type == RLOCK)) {
+            rw = r->waiters[row];
+            pthread_mutex_lock(&pending_io_lock);
+            assert(CAN_PRODUCE_PENDING_IO);
+            
+            req = PENDING_IO_ENT(io_prod++);
+            req->op    = IO_RWAKE;
+            req->cb    = rw->cb;
+            req->param = rw->param;
+            r->lines[row]++; /* read lock the row. */
+            r->state[row] = READ; 
+            r->waiters[row] = rw->next;
+            free(rw);
+            pthread_mutex_unlock(&pending_io_lock);
+        }
+
+        if (r->waiters[row] != NULL) /* There is a write queued still */
+            r->state[row] = STOP;
+    }  
+    
+    /* NOTE(review): a single signal is sent even when several wake requests
+     * were queued above; an I/O thread re-checks the ring after each request,
+     * so they are all serviced, but possibly by one thread -- confirm that
+     * this is acceptable. */
+    pthread_mutex_lock(&pending_io_lock);
+    pthread_cond_signal(&pending_io_cond);
+    pthread_mutex_unlock(&pending_io_lock);
+}
+
+/* Release one read lock on 'row' and wake any waiters once the last reader
+ * has gone.  'cb' is then called directly with an IO_INT_T result of 0,
+ * matching what block_rlock() reports on success.
+ */
+void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
+{
+    struct io_ret ret;
+
+    pthread_mutex_lock(&r->lock);
+    assert(r->lines[row] > 0); /* try to catch misuse. */
+    r->lines[row]--;
+    if (r->lines[row] == 0) {
+        r->state[row] = ANY;
+        wake_waiters(r, row);
+    }
+    pthread_mutex_unlock(&r->lock);
+
+    /* The result used to be passed to the callback uninitialised. */
+    ret.type = IO_INT_T;
+    ret.u.i  = 0;
+    cb(ret, param);
+}
+
+/* Release the write lock on 'row' and wake any waiters.  'cb' is then
+ * called directly with an IO_INT_T result of 0, matching what block_wlock()
+ * reports on success.
+ */
+void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
+{
+    struct io_ret ret;
+
+    pthread_mutex_lock(&r->lock);
+    assert(r->lines[row] == -1); /* try to catch misuse. */
+    r->lines[row] = 0;
+    r->state[row] = ANY;
+    wake_waiters(r, row);
+    pthread_mutex_unlock(&r->lock);
+
+    /* The result used to be passed to the callback uninitialised. */
+    ret.type = IO_INT_T;
+    ret.u.i  = 0;
+    cb(ret, param);
+}
+
+/* consumer calls */
+
+/* Carry out one queued request, return its slot to the ring, then invoke
+ * the request's callback with the result.
+ *
+ * Both the callback and its parameter are copied out of the slot before the
+ * slot is released: once io_free has been advanced a producer may reuse and
+ * overwrite the slot, so it must not be read afterwards.
+ */
+static void do_next_io_req(struct pending_io_req *req)
+{
+    struct io_ret  ret;
+    io_cb_t        cb;
+    void          *param;
+
+    switch (req->op) {
+    case IO_READ:
+        ret.type = IO_BLOCK_T;
+        ret.u.b  = readblock(req->u.r.addr);
+        break;
+    case IO_WRITE:
+        ret.type = IO_INT_T;
+        ret.u.i  = writeblock(req->u.w.addr, req->u.w.block);
+        DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr);
+        break;
+    case IO_ALLOC:
+        ret.type = IO_ADDR_T;
+        ret.u.a  = allocblock(req->u.a.block);
+        break;
+    case IO_RWAKE:
+        DPRINTF("WAKE DEFERRED RLOCK!\n");
+        ret.type = IO_INT_T;
+        ret.u.i  = 0;
+        break;
+    case IO_WWAKE:
+        DPRINTF("WAKE DEFERRED WLOCK!\n");
+        ret.type = IO_INT_T;
+        ret.u.i  = 0;
+        break;
+    default:
+        /* NOTE(review): this path neither releases the slot nor calls the
+         * callback; no visible producer queues an unknown op. */
+        DPRINTF("Unknown IO operation on pending list!\n");
+        return;
+    }
+
+    /* Snapshot everything still needed from the slot before freeing it. */
+    assert(req->cb != NULL);
+    cb    = req->cb;
+    param = req->param;
+
+    pthread_mutex_lock(&pending_io_lock);
+    pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req);
+    pthread_mutex_unlock(&pending_io_lock);
+
+    cb(ret, param);
+}
+
+/* Body of each I/O pool thread: sleep until the ring holds an unclaimed
+ * request, claim it, and service it outside the ring lock.  Never returns.
+ * 'param' points to a heap-allocated int holding the thread's id; it is
+ * freed here.
+ */
+void *io_thread(void *param) 
+{
+    struct pending_io_req *next;
+    int tid;
+
+    /* Set this thread's tid. */
+    tid = *(int *)param;
+    free(param);
+
+    for (;;) {
+        pthread_mutex_lock(&pending_io_lock);
+        while (!CAN_CONSUME_PENDING_IO)
+            pthread_cond_wait(&pending_io_cond, &pending_io_lock);
+
+        next = PENDING_IO_ENT(io_cons++);
+        pthread_mutex_unlock(&pending_io_lock);
+
+        do_next_io_req(next);
+    }
+}
+
+static pthread_t io_pool[IO_POOL_SIZE];
+
+/* Start the IO_POOL_SIZE worker threads.  Each thread is handed its index
+ * in a heap-allocated int, which io_thread() frees.  A thread that cannot
+ * be started is reported and skipped; its tid cell is freed here, since no
+ * thread exists to do so.
+ */
+void start_io_threads(void)
+{
+    int i;
+
+    for (i = 0; i < IO_POOL_SIZE; i++) {
+        int ret, *t;
+
+        t = (int *)malloc(sizeof(int));
+        if (t == NULL) {
+            printf("Error starting thread %d\n", i);
+            continue;
+        }
+        *t = i;
+
+        ret = pthread_create(&io_pool[i], NULL, io_thread, t);
+        if (ret != 0) {
+            printf("Error starting thread %d\n", i);
+            free(t);
+        }
+    }
+}
+
+/* Set up the asynchronous block layer: initialise the pending I/O ring,
+ * then start the pool of I/O threads that service it.
+ */
+void init_block_async(void)
+{
+    init_pending_io();
+    start_io_threads();
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_snap_list.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_snap_list.c     Sun Jul  3 22:36:48 2005
@@ -0,0 +1,82 @@
+/**************************************************************************
+ * 
+ * vdi_snap_list.c
+ *
+ * Print a list of snapshots for the specified vdi.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/* List the snapshots of a VDI, newest first, following the chain of
+ * snapshot blocks through each block's parent pointer.
+ *
+ * usage: vdi_snap_list <VDI id> [max snaps]
+ *   max snaps limits the number of entries printed; the default of -1
+ *   means no limit.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t        *vdi;
+    u64           id;
+    int           i, max_snaps = -1;
+    snap_block_t *blk;
+    snap_id_t     sid;
+    char         *t;
+
+    __init_blockstore();
+    __init_vdi();
+
+    if ( argc == 1 ) {
+        printf("usage: %s <VDI id> [max snaps]\n", argv[0]);
+        exit(-1);
+    }
+
+    id = (u64) atoll(argv[1]);
+
+    if ( argc > 2 ) {
+        max_snaps = atoi(argv[2]);
+    }
+
+    vdi = vdi_get(id);
+
+    if ( vdi == NULL ) {
+        /* Nothing was returned, so there is nothing to release. */
+        printf("couldn't find the requested VDI.\n");
+        exit(-1);
+    }
+
+    /* Start at the entry just before the VDI's current snapshot id. */
+    sid = vdi->snap;
+    sid.index--;
+
+    /* Only the snapshot id is needed from here on; release the VDI the
+     * same way vdi_list does. */
+    freeblock(vdi);
+
+    printf("%8s%4s%37s %12s %1s\n", "Block", "idx", "timestamp", 
+            "radix root", "d");
+
+    while (sid.block != 0) {
+        blk = snap_get_block(sid.block);
+        if ( blk == NULL ) {
+            printf("couldn't read snapshot block.\n");
+            exit(-1);
+        }
+        for (i = sid.index; i >= 0; i--) {
+            if ( max_snaps == 0  ) {
+                freeblock(blk);
+                goto done;
+            }
+            /* ctime() output ends in a newline; strip it. */
+            t = ctime(&blk->snaps[i].timestamp.tv_sec);
+            t[strlen(t)-1] = '\0';
+            printf("%8Ld%4u%30s %06lu %12Ld %1s\n",
+                    sid.block, i, 
+                    t,
+                    blk->snaps[i].timestamp.tv_usec,
+                    blk->snaps[i].radix_root,
+                    blk->snaps[i].deleted ? "*" : " ");
+            if ( max_snaps != -1 ) 
+                max_snaps--;
+        }
+        /* Copy the parent id out before the block is released. */
+        sid = blk->hdr.parent_block;
+        freeblock(blk);
+    }
+done:            
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_list.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_list.c  Sun Jul  3 22:36:48 2005
@@ -0,0 +1,47 @@
+/**************************************************************************
+ * 
+ * vdi_list.c
+ *
+ * Print a list of VDIs on the block store.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/* Print the id and name of every VDI that can be fetched, trying each id
+ * below the registry's nr_vdis count.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_registry_t *registry;
+    vdi_t          *entry;
+    int             idx;
+
+    __init_blockstore();
+    __init_vdi();
+
+    registry = get_vdi_registry();
+    if ( registry == NULL ) {
+        printf("couldn't get VDI registry.\n");
+        exit(-1);
+    }
+
+    for (idx = 0; idx < registry->nr_vdis; idx++) {
+        entry = vdi_get(idx);
+        if ( entry == NULL )
+            continue;   /* no VDI returned for this id */
+
+        printf("%10Ld %60s\n", entry->id, entry->name);
+        freeblock(entry);
+    }
+
+    freeblock(registry);
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/blockstore.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/blockstore.c        Sun Jul  3 22:36:48 2005
@@ -0,0 +1,1350 @@
+/**************************************************************************
+ * 
+ * blockstore.c
+ *
+ * Simple block store interface
+ *
+ */
+ 
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <stdarg.h>
+#include "blockstore.h"
+#include <pthread.h>
+
+//#define BLOCKSTORE_REMOTE
+//#define BSDEBUG
+
+#define RETRY_TIMEOUT 1000000 /* microseconds */
+
+/*****************************************************************************
+ * Debugging
+ */
+#ifdef BSDEBUG
+/* printf-style debug output to stderr.  Each message is prefixed with the
+ * calling thread's value for tid_key, in square brackets.  Only compiled
+ * when BSDEBUG is defined; otherwise DB() is a no-op macro.
+ */
+void DB(char *format, ...)
+{
+    va_list args;
+    /* NOTE(review): the thread-specific pointer is cast straight to int. */
+    fprintf(stderr, "[%05u] ", (int)pthread_getspecific(tid_key));
+    va_start(args, format);
+    vfprintf(stderr, format, args);
+    va_end(args);
+}
+#else
+#define DB(format, ...) (void)0
+#endif
+
+#ifdef BLOCKSTORE_REMOTE
+
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
+#include <netdb.h>
+
+/*****************************************************************************
+ * Network state                                                             *
+ *****************************************************************************/
+
+/* The individual disk servers we talk to. These will be referenced by
+ * an integer index into bsservers[].
+ */
+bsserver_t bsservers[MAX_SERVERS];
+
+/* The cluster map. This is indexed by an integer cluster number.
+ */
+bscluster_t bsclusters[MAX_CLUSTERS];
+
+/* Local socket.
+ */
+struct sockaddr_in sin_local;
+int bssock = 0;
+
+/*****************************************************************************
+ * Notification                                                              *
+ *****************************************************************************/
+
+/* Per-thread notification state: a mutex/condition-variable pair plus a
+ * flag recording that new data has arrived for the thread.
+ */
+typedef struct pool_thread_t_struct {
+    pthread_mutex_t ptmutex;
+    pthread_cond_t ptcv;
+    int newdata;
+} pool_thread_t;
+
+/* One entry per thread id; indexed by the tid passed to the macros below. */
+pool_thread_t pool_thread[READ_POOL_SIZE+1];
+
+/* Tell thread 'tid' that a reply for it has arrived: set its newdata flag
+ * and signal its condition variable, both under its mutex.
+ */
+#define RECV_NOTIFY(tid) { \
+    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
+    pool_thread[tid].newdata = 1; \
+    DB("CV Waking %u", tid); \
+    pthread_cond_signal(&(pool_thread[tid].ptcv)); \
+    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
+/* Wait for a notification for thread 'tid'.  If newdata is already set it
+ * is cleared and the macro continues at once; otherwise the thread blocks
+ * once on its condition variable.
+ * NOTE(review): the wait is not in a loop and newdata is not cleared after
+ * waking, so the flag is still set on the next call.  The visible caller,
+ * wait_recv(), re-checks its own condition in a loop -- confirm that this
+ * is the intended protocol.
+ */
+#define RECV_AWAIT(tid) { \
+    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
+    if (pool_thread[tid].newdata) { \
+        pool_thread[tid].newdata = 0; \
+        DB("CV Woken %u", tid); \
+    } \
+    else { \
+        DB("CV Waiting %u", tid); \
+        pthread_cond_wait(&(pool_thread[tid].ptcv), \
+                          &(pool_thread[tid].ptmutex)); \
+    } \
+    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
+
+/*****************************************************************************
+ * Message queue management                                                  *
+ *****************************************************************************/
+
+/* Protects the queue manipulation critical regions.
+ */
+pthread_mutex_t ptmutex_queue;
+#define ENTER_QUEUE_CR pthread_mutex_lock(&ptmutex_queue)
+#define LEAVE_QUEUE_CR pthread_mutex_unlock(&ptmutex_queue)
+
+pthread_mutex_t ptmutex_recv;
+#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv)
+#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv)
+
+/* A message queue entry. We allocate one of these for every request we send.
+ * Asynchronous reply reception also uses one of these.
+ */
+typedef struct bsq_t_struct {
+    /* Links of the doubly-linked queue of outstanding requests. */
+    struct bsq_t_struct *prev;
+    struct bsq_t_struct *next;
+    /* Bit flags; BSQ_STATUS_MATCHED is set once a reply has been matched. */
+    int status;
+    /* Index into bsservers[] of the server this message is for or from. */
+    int server;
+    /* Message length; overwritten with the reply's length on a match. */
+    int length;
+    /* Scatter/gather description filled in before sendmsg()/recvmsg():
+     * iov[0] covers 'message', iov[1] covers 'block' when one is set. */
+    struct msghdr msghdr;
+    struct iovec iov[2];
+    /* Sending thread's tid_key value; used to notify it of the reply. */
+    int tid;
+    /* Time of the most recent send or retry. */
+    struct timeval tv_sent;
+    /* Message header, and the optional block-sized data buffer. */
+    bshdr_t message;
+    void *block;
+} bsq_t;
+
+#define BSQ_STATUS_MATCHED 1
+
+pthread_mutex_t ptmutex_luid;
+#define ENTER_LUID_CR pthread_mutex_lock(&ptmutex_luid)
+#define LEAVE_LUID_CR pthread_mutex_unlock(&ptmutex_luid)
+
+static u64 luid_cnt = 0x1000ULL;
+/* Return the current value of luid_cnt and advance the counter by one,
+ * inside the LUID critical region.
+ */
+u64 new_luid(void) {
+    u64 next;
+
+    ENTER_LUID_CR;
+    next     = luid_cnt;
+    luid_cnt = next + 1;
+    LEAVE_LUID_CR;
+
+    return next;
+}
+
+/* Queue of outstanding requests.
+ */
+bsq_t *bs_head = NULL;
+bsq_t *bs_tail = NULL;
+int bs_qlen = 0;
+
+/*
+ */
+/* Dump the request queue to stderr: a header line with 'msg' and the queue
+ * length, then the luid and server of every entry.
+ */
+void queuedebug(char *msg) {
+    bsq_t *cur;
+
+    ENTER_QUEUE_CR;
+    fprintf(stderr, "Q: %s len=%u\n", msg, bs_qlen);
+    cur = bs_head;
+    while (cur != NULL) {
+        fprintf(stderr, "  luid=%016llx server=%u\n",
+                cur->message.luid, cur->server);
+        cur = cur->next;
+    }
+    LEAVE_QUEUE_CR;
+}
+
+/* Append 'qe' to the tail of the request queue.  Always returns 0. */
+int enqueue(bsq_t *qe) {
+    ENTER_QUEUE_CR;
+    qe->prev = bs_tail;
+    qe->next = NULL;
+    if (bs_head != NULL)
+        bs_tail->next = qe;
+    else
+        bs_head = qe;      /* queue was empty */
+    bs_tail = qe;
+    bs_qlen += 1;
+    LEAVE_QUEUE_CR;
+#ifdef BSDEBUG
+    queuedebug("enqueue");
+#endif
+    return 0;
+}
+
+/* Remove 'qe' from the request queue if it is on it.
+ *
+ * Returns 1 if the entry was found and unlinked, 0 if it was not queued.
+ * The debug trace now reports "found" on the success path; it used to log
+ * "dequeue not found" in both cases.
+ */
+int dequeue(bsq_t *qe) {
+    bsq_t *q;
+    int found = 0;
+
+    ENTER_QUEUE_CR;
+    for (q = bs_head; q; q = q->next) {
+        if (q != qe)
+            continue;
+
+        /* Unlink q, fixing up the head/tail pointers at the ends. */
+        if (q->prev)
+            q->prev->next = q->next;
+        else 
+            bs_head = q->next;
+        if (q->next)
+            q->next->prev = q->prev;
+        else
+            bs_tail = q->prev;
+        bs_qlen--;
+        found = 1;
+        break;
+    }
+    LEAVE_QUEUE_CR;
+
+#ifdef BSDEBUG
+    queuedebug(found ? "dequeue found" : "dequeue not found");
+#endif
+    return found;
+}
+
+/* Match a received message 'qe' against the queue of outstanding requests.
+ *
+ * A request matches when server, operation and luid all agree.  The match
+ * is updated from the reply (length, flags, id and, for reads, the data
+ * buffer, which is taken from 'qe'), marked BSQ_STATUS_MATCHED, unlinked
+ * from the queue and returned.  Returns NULL when nothing matches.
+ */
+bsq_t *queuesearch(bsq_t *qe) {
+    bsq_t *hit;
+
+    ENTER_QUEUE_CR;
+    for (hit = bs_head; hit != NULL; hit = hit->next) {
+        if ((qe->server != hit->server) ||
+            (qe->message.operation != hit->message.operation) ||
+            (qe->message.luid != hit->message.luid))
+            continue;
+
+        /* Read reply: hand the received buffer over to the request.
+         * NOTE(review): this tests the queued request's flags, which are
+         * only overwritten from the reply just below -- confirm that the
+         * reply's flags were not the ones intended. */
+        if ((hit->message.operation == BSOP_READBLOCK) &&
+            ((hit->message.flags & BSOP_FLAG_ERROR) == 0)) {
+            hit->block = qe->block;
+            qe->block = NULL;
+        }
+        hit->length = qe->length;
+        hit->message.flags = qe->message.flags;
+        hit->message.id = qe->message.id;
+        hit->status |= BSQ_STATUS_MATCHED;
+
+        /* Unlink the request from the queue. */
+        if (hit->prev)
+            hit->prev->next = hit->next;
+        else 
+            bs_head = hit->next;
+        if (hit->next)
+            hit->next->prev = hit->prev;
+        else
+            bs_tail = hit->prev;
+        hit->next = NULL;
+        hit->prev = NULL;
+        bs_qlen--;
+        break;
+    }
+    LEAVE_QUEUE_CR;
+
+#ifdef BSDEBUG
+    queuedebug(hit ? "queuesearch found" : "queuesearch not found");
+#endif
+    return hit;
+}
+
+/*****************************************************************************
+ * Network communication                                                     *
+ *****************************************************************************/
+
+/* Send the request 'qe' to its server and leave it on the queue of
+ * outstanding requests so that the reply can be matched to it.
+ *
+ * The message header is always sent; 'qe->block', when set, is sent as a
+ * second iovec element.  A fresh luid is assigned to the message.
+ *
+ * Returns the sendmsg() result on success and a negative value on failure.
+ * On failure the entry is taken off the queue again: callers free 'qe' when
+ * this function fails, and a freed entry must not stay linked where the
+ * queue runner and the receive loop would still walk over it.
+ */
+int send_message(bsq_t *qe) {
+    int rc;
+
+    qe->msghdr.msg_name = (void *)&(bsservers[qe->server].sin);
+    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
+    qe->msghdr.msg_iov = qe->iov;
+    if (qe->block)
+        qe->msghdr.msg_iovlen = 2;
+    else
+        qe->msghdr.msg_iovlen = 1;
+    qe->msghdr.msg_control = NULL;
+    qe->msghdr.msg_controllen = 0;
+    qe->msghdr.msg_flags = 0;
+
+    qe->iov[0].iov_base = (void *)&(qe->message);
+    qe->iov[0].iov_len = MSGBUFSIZE_ID;
+
+    if (qe->block) {
+        qe->iov[1].iov_base = qe->block;
+        qe->iov[1].iov_len = BLOCK_SIZE;
+    }
+
+    qe->message.luid = new_luid();
+
+    qe->status = 0;
+    qe->tid = (int)pthread_getspecific(tid_key);
+    /* Queue before sending so that a fast reply always finds its request. */
+    if (enqueue(qe) < 0) {
+        fprintf(stderr, "Error enqueuing request.\n");
+        return -1;
+    }
+
+    gettimeofday(&(qe->tv_sent), NULL);
+    DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid);
+    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
+    if (rc < 0) {
+        /* Nothing was sent: do not leave a soon-to-be-freed entry queued. */
+        dequeue(qe);
+    }
+
+    return rc;
+}
+
+/* Receive one message from the socket directly into 'qe': the header goes
+ * into qe->message and, when qe->block is set, the payload into that
+ * buffer.  Returns the recvmsg() result.
+ */
+int recv_message(bsq_t *qe) {
+    struct sockaddr_in peer;
+
+    /* Describe the destination buffers. */
+    qe->iov[0].iov_base = (void *)&(qe->message);
+    qe->iov[0].iov_len = MSGBUFSIZE_ID;
+    if (qe->block) {
+        qe->iov[1].iov_base = qe->block;
+        qe->iov[1].iov_len = BLOCK_SIZE;
+    }
+
+    /* Fill in the message header that refers to them. */
+    qe->msghdr.msg_name = &peer;
+    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
+    qe->msghdr.msg_iov = qe->iov;
+    qe->msghdr.msg_iovlen = qe->block ? 2 : 1;
+    qe->msghdr.msg_control = NULL;
+    qe->msghdr.msg_controllen = 0;
+    qe->msghdr.msg_flags = 0;
+
+    return recvmsg(bssock, &(qe->msghdr), 0);
+}
+
+/* Map a socket address to an index into bsservers[].
+ *
+ * Only entries with a hostname set are considered; family, port and address
+ * must all match.  Returns the index, or -1 if no server matches.
+ *
+ * The two debug arguments that the mail archiver had wrapped across lines
+ * (losing the leading patch marker on the continuation) are re-split here
+ * so that every line carries its marker.
+ */
+int get_server_number(struct sockaddr_in *sin) {
+    int i;
+
+#ifdef BSDEBUG2
+    fprintf(stderr,
+            "get_server_number(%u.%u.%u.%u/%u)\n",
+            (unsigned int)sin->sin_addr.s_addr & 0xff,
+            ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff,
+            ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff,
+            ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff,
+            (unsigned int)sin->sin_port);
+#endif
+
+    for (i = 0; i < MAX_SERVERS; i++) {
+        if (bsservers[i].hostname) {
+#ifdef BSDEBUG2
+            fprintf(stderr,
+                    "get_server_number check %u.%u.%u.%u/%u\n",
+                    (unsigned int)bsservers[i].sin.sin_addr.s_addr&0xff,
+                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 8)&0xff,
+                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 16)
+                        &0xff,
+                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 24)
+                        &0xff,
+                    (unsigned int)bsservers[i].sin.sin_port);
+#endif
+            if ((sin->sin_family == bsservers[i].sin.sin_family) &&
+                (sin->sin_port == bsservers[i].sin.sin_port) &&
+                (memcmp((void *)&(sin->sin_addr),
+                        (void *)&(bsservers[i].sin.sin_addr),
+                        sizeof(struct in_addr)) == 0)) {
+                return i;
+            }
+        }        
+    }
+
+    return -1;
+}
+
+/* Spare receive buffer and the single queue entry that recv_any() fills. */
+void *rx_buffer = NULL;
+bsq_t rx_qe;
+
+/* Receive the next message from any server into the static rx_qe.
+ *
+ * A BLOCK_SIZE payload buffer is taken from rx_buffer, or allocated when
+ * none is spare, and attached to rx_qe.  On success rx_qe.length holds the
+ * received size and rx_qe.server the index from get_server_number().
+ *
+ * Returns &rx_qe, or NULL on allocation or receive failure.  On a receive
+ * failure the buffer is put back into rx_buffer; the receive loop recycles
+ * the buffer only after a successful receive, so it used to leak on every
+ * error.
+ */
+bsq_t *recv_any(void) {
+    struct sockaddr_in from;
+    int rc;
+    
+    DB("ENTER recv_any\n");
+
+    rx_qe.msghdr.msg_name = &from;
+    rx_qe.msghdr.msg_namelen = sizeof(struct sockaddr_in);
+    rx_qe.msghdr.msg_iov = rx_qe.iov;
+    if (!rx_buffer) {
+        rx_buffer = malloc(BLOCK_SIZE);
+        if (!rx_buffer) {
+            perror("recv_any malloc");
+            return NULL;
+        }
+    }
+    /* Move the spare buffer into the entry. */
+    rx_qe.block = rx_buffer;
+    rx_buffer = NULL;
+    rx_qe.msghdr.msg_iovlen = 2;
+    rx_qe.msghdr.msg_control = NULL;
+    rx_qe.msghdr.msg_controllen = 0;
+    rx_qe.msghdr.msg_flags = 0;
+    
+    rx_qe.iov[0].iov_base = (void *)&(rx_qe.message);
+    rx_qe.iov[0].iov_len = MSGBUFSIZE_ID;
+    rx_qe.iov[1].iov_base = rx_qe.block;
+    rx_qe.iov[1].iov_len = BLOCK_SIZE;
+
+    rc = recvmsg(bssock, &(rx_qe.msghdr), 0);
+    if (rc < 0) {
+        perror("recv_any");
+        /* Keep the buffer for the next attempt instead of leaking it. */
+        rx_buffer = rx_qe.block;
+        rx_qe.block = NULL;
+        return NULL;
+    }
+
+    rx_qe.length = rc;    
+    rx_qe.server = get_server_number(&from);
+
+    DB("recv_any from %d luid=%016llx len=%u\n",
+       rx_qe.server, rx_qe.message.luid, rx_qe.length);
+
+    return &rx_qe;
+}
+
+/* If 'q' still owns a payload buffer, move it into rx_buffer so that the
+ * next recv_any() can reuse it.
+ */
+void recv_recycle_buffer(bsq_t *q) {
+    if (q->block == NULL)
+        return;
+
+    rx_buffer = q->block;
+    q->block = NULL;
+}
+
+/* Block until every request in reqs[0..numreqs-1] has been matched with a
+ * reply, i.e. has BSQ_STATUS_MATCHED set in its status.
+ *
+ * Replies are received and matched by receive_loop(); this function only
+ * waits on the calling thread's notification slot and re-checks after each
+ * wake-up.  Returns numreqs.
+ *
+ * The unused locals that broke -Wall -Werror builds are gone, and the goto
+ * loop and the disabled inline-receive code have been replaced by a loop.
+ */
+int wait_recv(bsq_t **reqs, int numreqs) {
+    unsigned int all;
+    int i;
+    int tid = (int)pthread_getspecific(tid_key);
+
+    DB("ENTER wait_recv %u\n", numreqs);
+
+    for (;;) {
+        /* AND the status words together: a bit survives only if it is set
+         * in every request. */
+        all = 0xffffffff;
+        for (i = 0; i < numreqs; i++) {
+            all &= reqs[i]->status;
+        }
+        if ((all & BSQ_STATUS_MATCHED)) {
+            DB("LEAVE wait_recv\n");
+            return numreqs;
+        }
+
+        RECV_AWAIT(tid);
+    }
+}
+
+/* retry
+ */
+static int retry_count = 0;
+/* Resend an already prepared request: refresh its send timestamp, count
+ * the retry and send the stored msghdr again.  Returns 0 on success or the
+ * negative sendmsg() result on failure.
+ */
+int retry(bsq_t *qe)
+{
+    int sent;
+
+    gettimeofday(&(qe->tv_sent), NULL);
+    DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid);
+    retry_count += 1;
+
+    sent = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
+    return (sent < 0) ? sent : 0;
+}
+
+/* queue runner
+ *
+ * Thread body: once a second, walk the queue of outstanding requests and
+ * resend every one whose last send is more than RETRY_TIMEOUT microseconds
+ * old.  Never returns.
+ *
+ * Timestamps are converted to microseconds in 64-bit arithmetic.  tv_sec is
+ * widened before the multiplication, because multiplying it by 1000000 in a
+ * 32-bit long overflows.
+ */
+void *queue_runner(void *arg)
+{
+    for (;;) {
+        struct timeval now;
+        long long nowus, sus;
+        bsq_t *q;
+        int r;
+
+        sleep(1);
+
+        gettimeofday(&now, NULL);
+        nowus = (long long)now.tv_sec * 1000000LL + now.tv_usec;
+        ENTER_QUEUE_CR;
+        r = retry_count;
+        for (q = bs_head; q; q = q->next) {
+            sus = (long long)q->tv_sent.tv_sec * 1000000LL
+                  + q->tv_sent.tv_usec;
+            if ((nowus - sus) > RETRY_TIMEOUT) {
+                if (retry(q) < 0) {
+                    fprintf(stderr, "Error on sendmsg retry.\n");
+                }
+            }
+        }
+        /* Report how many retries this pass issued, and the running total. */
+        if (r != retry_count) {
+            fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count);
+        }
+        LEAVE_QUEUE_CR;
+    }
+}
+
+/* receive loop
+ *
+ * Thread body: receive messages forever, match each one against the queue
+ * of outstanding requests and notify the thread that sent the matching
+ * request.  Receive errors and unmatched messages are reported on stderr.
+ */
+void *receive_loop(void *arg)
+{
+    bsq_t *incoming, *match;
+
+    while (1) {
+        incoming = recv_any();
+        if (incoming == NULL) {
+            fprintf(stderr, "recv_any error\n");
+            continue;
+        }
+
+        match = queuesearch(incoming);
+        /* Whatever buffer the match did not take is kept for reuse. */
+        recv_recycle_buffer(incoming);
+        if (match == NULL) {
+            fprintf(stderr, "Unmatched RX\n");
+            continue;
+        }
+
+        DB("RX MATCH");
+        RECV_NOTIFY(match->tid);
+    }
+}
+pthread_t pthread_recv;
+
+/*****************************************************************************
+ * Reading                                                                   *
+ *****************************************************************************/
+
+/* Read block 'id' from one specific server.
+ *
+ * Sends a BSOP_READBLOCK request without a payload buffer and waits for the
+ * reply; on a match queuesearch() attaches the received data buffer to the
+ * request, and that buffer is returned to the caller.
+ *
+ * Returns the block, or NULL on allocation failure, send failure, a server
+ * error flag in the reply, or a reply shorter than MSGBUFSIZE_BLOCK.
+ *
+ * The short-reply diagnostic now prints the received length; it used to
+ * print a local variable that was never assigned.
+ */
+void *readblock_indiv(int server, u64 id) {
+    void *block;
+    bsq_t *qe;
+    int rc;
+
+    qe = (bsq_t *)malloc(sizeof(bsq_t));
+    if (!qe) {
+        perror("readblock qe malloc");
+        return NULL;
+    }
+    /* No buffer of our own: the reply's buffer is attached on a match. */
+    qe->block = NULL;
+
+    qe->server = server;
+
+    qe->message.operation = BSOP_READBLOCK;
+    qe->message.flags = 0;
+    qe->message.id = id;
+    qe->length = MSGBUFSIZE_ID;
+
+    if (send_message(qe) < 0) {
+        perror("readblock sendto");
+        goto err;
+    }
+
+    rc = wait_recv(&qe, 1);
+    if (rc < 0) {
+        perror("readblock recv");
+        goto err;
+    }
+
+    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "readblock server error\n");
+        goto err;
+    }
+    if (qe->length < MSGBUFSIZE_BLOCK) {
+        fprintf(stderr, "readblock recv short (%u)\n", qe->length);
+        goto err;
+    }
+
+    /* Ownership of the data buffer passes to the caller. */
+    block = qe->block;
+
+    free((void *)qe);
+    return block;
+
+    err:
+    if (qe->block)
+        free(qe->block);
+    free((void *)qe);
+    return NULL;
+}
+
+/**
+ * readblock: read a block from disk
+ *   @id: block id to read
+ *
+ *   Blocks with id < 6 are always read from the first server of the
+ *   cluster.  All other reads rotate round-robin over the cluster's
+ *   replicas, using the per-replica id for the chosen server.
+ *
+ *   The rotation counter is shared by all callers, so it is advanced under
+ *   a mutex and copied to a local before use; a concurrent caller could
+ *   otherwise move it between the range check and the array index.  An
+ *   index without a matching replica case falls back to replica 0 instead
+ *   of using an uninitialised id.
+ *
+ *   @return: pointer to block, NULL on error
+ */
+void *readblock(u64 id) {
+    static pthread_mutex_t rr_lock = PTHREAD_MUTEX_INITIALIZER;
+    static int i = CLUSTER_MAX_REPLICAS - 1;
+    int map = (int)BSID_MAP(id);
+    int replica;
+    u64 xid;
+    void *block = NULL;
+
+    /* special case for the "superblock" just use the first block on the
+     * first replica. (extend to blocks < 6 for vdi bug)
+     */
+    if (id < 6) {
+        block = readblock_indiv(bsclusters[map].servers[0], id);
+        goto out;
+    }
+
+    /* Pick the next replica in rotation. */
+    pthread_mutex_lock(&rr_lock);
+    i++;
+    if (i >= CLUSTER_MAX_REPLICAS)
+        i = 0;
+    replica = i;
+    pthread_mutex_unlock(&rr_lock);
+
+    switch (replica) {
+    case 1:
+        xid = BSID_REPLICA1(id);
+        break;
+    case 2:
+        xid = BSID_REPLICA2(id);
+        break;
+    case 0:
+    default:
+        replica = 0;
+        xid = BSID_REPLICA0(id);
+        break;
+    }
+
+    block = readblock_indiv(bsclusters[map].servers[replica], xid);
+
+    out:
+#ifdef BSDEBUG
+    if (block)
+        fprintf(stderr, "READ:  %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
+                id,
+                (unsigned int)((unsigned char *)block)[0],
+                (unsigned int)((unsigned char *)block)[1],
+                (unsigned int)((unsigned char *)block)[2],
+                (unsigned int)((unsigned char *)block)[3],
+                (unsigned int)((unsigned char *)block)[4],
+                (unsigned int)((unsigned char *)block)[5],
+                (unsigned int)((unsigned char *)block)[6],
+                (unsigned int)((unsigned char *)block)[7]);
+    else
+        fprintf(stderr, "READ:  %016llx NULL\n", id);
+#endif
+    return block;
+}
+
+/*****************************************************************************
+ * Writing                                                                   *
+ *****************************************************************************/
+
+/**
+ * writeblock_indiv: queue a write of one block to a single server
+ *   @server: index of the destination server
+ *   @id: block id as known to that server
+ *   @block: data to send; referenced by the request, not copied
+ *
+ *   @return: the queued request (caller waits on it and frees it),
+ *            NULL if allocation or sending failed
+ */
+bsq_t *writeblock_indiv(int server, u64 id, void *block) {
+
+    bsq_t *qe;
+
+    qe = (bsq_t *)malloc(sizeof(bsq_t));
+    if (!qe) {
+        perror("writeblock qe malloc");
+        goto err;
+    }
+    qe->server = server;
+
+    qe->message.operation = BSOP_WRITEBLOCK;
+    qe->message.flags = 0;
+    qe->message.id = id;
+    //memcpy(qe->message.block, block, BLOCK_SIZE);
+    qe->block = block;
+    qe->length = MSGBUFSIZE_BLOCK;
+
+    if (send_message(qe) < 0) {
+        perror("writeblock sendto");
+        goto err;
+    }
+
+    return qe;
+
+    err:
+    /* qe is NULL when malloc failed; free(NULL) is a no-op. */
+    free((void *)qe);
+    return NULL;
+}
+    
+
+/**
+ * writeblock: write an existing block to disk
+ *   @id: block id
+ *   @block: pointer to block
+ *
+ *   Ids below 6 are written to the first replica only, using the id
+ *   unmodified.  All other ids are written to all three replicas of the
+ *   cluster named in the id, and every replica must acknowledge without
+ *   the error flag for the write to succeed.
+ *
+ *   @return: zero on success, -1 on failure
+ */
+int writeblock(u64 id, void *block) {
+    
+    int map = (int)BSID_MAP(id);
+    int rep0 = bsclusters[map].servers[0];
+    int rep1 = bsclusters[map].servers[1];
+    int rep2 = bsclusters[map].servers[2];
+    bsq_t *reqs[3];
+    int rc;
+
+    reqs[0] = reqs[1] = reqs[2] = NULL;
+
+#ifdef BSDEBUG
+    fprintf(stderr,
+            "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
+            id,
+            (unsigned int)((unsigned char *)block)[0],
+            (unsigned int)((unsigned char *)block)[1],
+            (unsigned int)((unsigned char *)block)[2],
+            (unsigned int)((unsigned char *)block)[3],
+            (unsigned int)((unsigned char *)block)[4],
+            (unsigned int)((unsigned char *)block)[5],
+            (unsigned int)((unsigned char *)block)[6],
+            (unsigned int)((unsigned char *)block)[7]);
+#endif
+
+    /* special case for the "superblock" just use the first block on the
+     * first replica. (extend to blocks < 6 for vdi bug)
+     */
+    if (id < 6) {
+        reqs[0] = writeblock_indiv(rep0, id, block);
+        if (!reqs[0])
+            return -1;
+        rc = wait_recv(reqs, 1);
+        if (rc < 0) {
+            perror("writeblock recv");
+            goto err;
+        }
+        if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
+            fprintf(stderr, "writeblock server0 error\n");
+            goto err;
+        }
+        /* The request used to be leaked on this path. */
+        free((void *)reqs[0]);
+        return 0;
+    }
+
+    reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block);
+    if (!reqs[0])
+        goto err;
+    reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block);
+    if (!reqs[1])
+        goto err;
+    reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block);
+    if (!reqs[2])
+        goto err;
+
+    rc = wait_recv(reqs, 3);
+    if (rc < 0) {
+        perror("writeblock recv");
+        goto err;
+    }
+    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "writeblock server0 error\n");
+        goto err;
+    }
+    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "writeblock server1 error\n");
+        goto err;
+    }
+    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "writeblock server2 error\n");
+        goto err;
+    }
+
+
+    free((void *)reqs[0]);
+    free((void *)reqs[1]);
+    free((void *)reqs[2]);
+    return 0;
+
+    err:
+    /* Only requests that were actually created are non-NULL here. */
+    if (reqs[0]) {
+        dequeue(reqs[0]);
+        free((void *)reqs[0]);
+    }
+    if (reqs[1]) {
+        dequeue(reqs[1]);
+        free((void *)reqs[1]);
+    }
+    if (reqs[2]) {
+        dequeue(reqs[2]);
+        free((void *)reqs[2]);
+    }
+    return -1;
+}
+
+/*****************************************************************************
+ * Allocation                                                                *
+ *****************************************************************************/
+
+/**
+ * allocblock: write a new block to disk
+ *   @block: pointer to block
+ *
+ *   Convenience wrapper: forwards to allocblock_hint() with a hint of 0.
+ *
+ *   @return: new id of block on disk (whatever allocblock_hint() returns)
+ */
+u64 allocblock(void *block) {
+    return allocblock_hint(block, 0);
+}
+
+/**
+ * allocblock_hint_indiv: queue an allocate-and-write request to one server
+ *   @server: index of the destination server
+ *   @block: data to send; referenced by the request, not copied
+ *   @hint: allocation hint, sent in the message id field
+ *
+ *   @return: the queued request (caller waits on it and frees it),
+ *            NULL if allocation or sending failed
+ */
+bsq_t *allocblock_hint_indiv(int server, void *block, u64 hint) {
+    bsq_t *qe;
+
+    qe = (bsq_t *)malloc(sizeof(bsq_t));
+    if (!qe) {
+        perror("allocblock_hint qe malloc");
+        goto err;
+    }
+    qe->server = server;
+
+    qe->message.operation = BSOP_ALLOCBLOCK;
+    qe->message.flags = 0;
+    qe->message.id = hint;
+    //memcpy(qe->message.block, block, BLOCK_SIZE);
+    qe->block = block;
+    qe->length = MSGBUFSIZE_BLOCK;
+
+    if (send_message(qe) < 0) {
+        perror("allocblock_hint sendto");
+        goto err;
+    }
+    
+    return qe;
+
+    err:
+    /* qe is NULL when malloc failed; free(NULL) is a no-op. */
+    free((void *)qe);
+    return NULL;
+}
+
+/**
+ * allocblock_hint: write a new block to disk
+ *   @block: pointer to block
+ *   @hint: allocation hint; used here as the cluster map index, so it
+ *          must be below MAX_CLUSTERS
+ *
+ *   The block is allocated on all three replicas of the chosen cluster and
+ *   the three per-replica ids are packed, with the map, into one id.
+ *
+ *   @return: new id of block on disk, 0 on failure
+ */
+u64 allocblock_hint(void *block, u64 hint) {
+    int map;
+    int rep0, rep1, rep2;
+    bsq_t *reqs[3];
+    int rc;
+    u64 id0, id1, id2;
+
+    reqs[0] = reqs[1] = reqs[2] = NULL;
+
+    DB("ENTER allocblock\n");
+
+    /* The hint indexes bsclusters[] and is shifted into the 4-bit map field
+     * of the id, so reject values that would overrun either. */
+    if (hint >= MAX_CLUSTERS) {
+        fprintf(stderr, "allocblock bad hint %llu\n",
+                (unsigned long long)hint);
+        return 0;
+    }
+    map  = (int)hint;
+    rep0 = bsclusters[map].servers[0];
+    rep1 = bsclusters[map].servers[1];
+    rep2 = bsclusters[map].servers[2];
+
+    reqs[0] = allocblock_hint_indiv(rep0, block, hint);
+    if (!reqs[0])
+        goto err;
+    reqs[1] = allocblock_hint_indiv(rep1, block, hint);
+    if (!reqs[1])
+        goto err;
+    reqs[2] = allocblock_hint_indiv(rep2, block, hint);
+    if (!reqs[2])
+        goto err;
+
+    rc = wait_recv(reqs, 3);
+    if (rc < 0) {
+        perror("allocblock recv");
+        goto err;
+    }
+    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "allocblock server0 error\n");
+        goto err;
+    }
+    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "allocblock server1 error\n");
+        goto err;
+    }
+    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "allocblock server2 error\n");
+        goto err;
+    }
+
+    /* Each reply carries the id that server assigned. */
+    id0 = reqs[0]->message.id;
+    id1 = reqs[1]->message.id;
+    id2 = reqs[2]->message.id;
+
+#ifdef BSDEBUG
+    fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
+            BSID(map, id0, id1, id2),
+            (unsigned int)((unsigned char *)block)[0],
+            (unsigned int)((unsigned char *)block)[1],
+            (unsigned int)((unsigned char *)block)[2],
+            (unsigned int)((unsigned char *)block)[3],
+            (unsigned int)((unsigned char *)block)[4],
+            (unsigned int)((unsigned char *)block)[5],
+            (unsigned int)((unsigned char *)block)[6],
+            (unsigned int)((unsigned char *)block)[7]);
+#endif
+    
+    free((void *)reqs[0]);
+    free((void *)reqs[1]);
+    free((void *)reqs[2]);
+    return BSID(map, id0, id1, id2);
+
+    err:
+    /* Only requests that were actually created are non-NULL here. */
+    if (reqs[0]) {
+        dequeue(reqs[0]);
+        free((void *)reqs[0]);
+    }
+    if (reqs[1]) {
+        dequeue(reqs[1]);
+        free((void *)reqs[1]);
+    }
+    if (reqs[2]) {
+        dequeue(reqs[2]);
+        free((void *)reqs[2]);
+    }
+    return 0;
+}
+
+#else /* /BLOCKSTORE_REMOTE */
+
+/*****************************************************************************
+ * Local storage version                                                     *
+ *****************************************************************************/
+ 
+/**
+ * readblock: read a block from the local store file
+ *   @id: block id to read (1-based: id N lives at offset (N-1)*BLOCK_SIZE)
+ *
+ *   The file "blockstore.dat" is opened, and closed again, on every call.
+ *
+ *   @return: pointer to a malloc'd block, NULL on error
+ */
+
+void *readblock(u64 id) {
+    void *buf = NULL;
+    int fd;
+
+    fd = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644);
+    if (fd < 0) {
+        perror("open");
+        return NULL;
+    }
+
+    /* Each step runs only if the previous one succeeded; buf stays NULL
+     * (or is reset to NULL) on any failure. */
+    if (lseek64(fd, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
+        printf ("%Ld ", id);
+        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
+        perror("readblock lseek");
+    } else if ((buf = malloc(BLOCK_SIZE)) == NULL) {
+        perror("readblock malloc");
+    } else if (read(fd, buf, BLOCK_SIZE) != BLOCK_SIZE) {
+        perror("readblock read");
+        free(buf);
+        buf = NULL;
+    }
+
+    close(fd);
+    return buf;
+}
+
+/**
+ * writeblock: write an existing block to the local store file
+ *   @id: block id (1-based: id N lives at offset (N-1)*BLOCK_SIZE)
+ *   @block: pointer to BLOCK_SIZE bytes of data
+ *
+ *   The file "blockstore.dat" is opened, and closed again, on every call.
+ *
+ *   @return: zero on success, -1 on failure
+ */
+int writeblock(u64 id, void *block) {
+    
+    int block_fp;
+    
+    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
+
+    if (block_fp < 0) {
+        perror("open");
+        return -1;
+    }
+
+    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
+        perror("writeblock lseek");
+        goto err;
+    }
+    /* A short write leaves a torn block, so anything other than a full
+     * BLOCK_SIZE counts as failure (same test as allocblock). */
+    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
+        perror("writeblock write");
+        goto err;
+    }
+    close(block_fp);
+    return 0;
+
+err:
+    close(block_fp);
+    return -1;
+}
+
+/**
+ * allocblock: append a new block to the local store file
+ *   @block: pointer to BLOCK_SIZE bytes of data
+ *
+ *   The block is written at the current end of "blockstore.dat", which
+ *   must be a whole number of blocks long.
+ *
+ *   @return: new (1-based) id of block on disk, 0 on failure
+ */
+
+u64 allocblock(void *block) {
+    u64 new_id = 0;
+    off64_t end;
+    int fd;
+
+    fd = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
+    if (fd < 0) {
+        perror("open");
+        return 0;
+    }
+
+    /* new_id stays 0 unless every step succeeds. */
+    end = lseek64(fd, 0, SEEK_END);
+    if (end == (off64_t)-1)
+        perror("allocblock lseek");
+    else if (end % BLOCK_SIZE != 0)
+        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
+    else if (write(fd, block, BLOCK_SIZE) != BLOCK_SIZE)
+        perror("allocblock write");
+    else
+        new_id = end / BLOCK_SIZE + 1;
+
+    close(fd);
+    return new_id;
+}
+
+/**
+ * allocblock_hint: write a new block to disk
+ *   @block: pointer to block
+ *   @hint: allocation hint (ignored by this local-storage version)
+ *
+ *   Simply forwards to allocblock().
+ *
+ *   @return: new id of block on disk
+ */
+u64 allocblock_hint(void *block, u64 hint) {
+    return allocblock(block);
+}
+
+#endif /* BLOCKSTORE_REMOTE */
+
+/*****************************************************************************
+ * Memory management                                                         *
+ *****************************************************************************/
+
+/**
+ * newblock: allocate a zero-filled in-memory block of BLOCK_SIZE bytes
+ *
+ *   @return: pointer to new block, NULL on error (reported via perror)
+ */
+void *newblock() {
+    /* calloc hands back memory that is already zeroed. */
+    void *zeroed = calloc(1, BLOCK_SIZE);
+
+    if (!zeroed)
+        perror("newblock");
+    return zeroed;
+}
+
+
+/**
+ * freeblock: release an in-memory block
+ *   @block: block to be freed; NULL is accepted and ignored
+ */
+void freeblock(void *block) {
+    /* free() is defined to do nothing for a NULL pointer. */
+    free(block);
+}
+
+/*
+ * new_freeblock: build an empty in-memory freelist block with its magic
+ * set, no successor and no entries.  Returns NULL if no block could be
+ * allocated.
+ */
+static freeblock_t *new_freeblock(void)
+{
+    freeblock_t *fresh = newblock();
+
+    if (fresh != NULL) {
+        fresh->magic = FREEBLOCK_MAGIC;
+        fresh->next  = 0ULL;
+        fresh->count = 0ULL;
+        memset(fresh->list, 0, sizeof fresh->list);
+    }
+
+    return fresh;
+}
+
+/*
+ * releaseblock: record a block id on the on-disk freelist
+ *   @id: block id to add
+ *
+ * The superblock names a "current" freelist block that is being filled and
+ * a chain of "full" ones.  The id is appended to the current block; if that
+ * block is full it is first pushed onto the full chain and replaced by a
+ * newly allocated one.  If any read or allocation fails, a message is
+ * printed and the id is not recorded.
+ */
+void releaseblock(u64 id)
+{
+    blockstore_super_t *bs_super;
+    freeblock_t *fl_current = NULL;
+    u64 new_id;
+    
+    /* get superblock */
+    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
+    if (bs_super == NULL) {
+        fprintf(stderr, "releaseblock: cannot read superblock\n");
+        return;
+    }
+    
+    /* get freeblock_current */
+    if (bs_super->freelist_current == 0ULL) 
+    {
+        fl_current = new_freeblock();
+        if (fl_current == NULL)
+            goto fail;
+        new_id = allocblock(fl_current);
+        if (new_id == 0ULL)
+            goto fail;
+        bs_super->freelist_current = new_id;
+        writeblock(BLOCKSTORE_SUPER, bs_super);
+    } else {
+        fl_current = readblock(bs_super->freelist_current);
+        if (fl_current == NULL)
+            goto fail;
+    }
+    
+    /* if full, chain to superblock and allocate new current.
+     * ">=" rather than "==" so that an oversize count can never index past
+     * the end of list[] below. */
+    
+    if (fl_current->count >= FREEBLOCK_SIZE) {
+        fl_current->next = bs_super->freelist_full;
+        writeblock(bs_super->freelist_current, fl_current);
+        bs_super->freelist_full = bs_super->freelist_current;
+        freeblock(fl_current);
+        fl_current = new_freeblock();
+        if (fl_current == NULL)
+            goto fail;
+        new_id = allocblock(fl_current);
+        if (new_id == 0ULL)
+            goto fail;
+        bs_super->freelist_current = new_id;
+        writeblock(BLOCKSTORE_SUPER, bs_super);
+    }
+    
+    /* append id to current */
+    fl_current->list[fl_current->count++] = id;
+    writeblock(bs_super->freelist_current, fl_current);
+    
+    freeblock(fl_current);
+    freeblock(bs_super);
+    return;
+
+ fail:
+    fprintf(stderr, "releaseblock: freelist update failed\n");
+    freeblock(fl_current);   /* may be NULL; freeblock accepts that */
+    freeblock(bs_super);
+}
+
+/* freelist debug functions: */
+
+/*
+ * freelist_count: print how many ids are on the freelist
+ *   @print_each: when 1, also print every id
+ *
+ * Reports the current freelist block first, then walks the chain of full
+ * blocks and prints the grand total.  Stops with a message if a block
+ * cannot be read.
+ */
+void freelist_count(int print_each)
+{
+    blockstore_super_t *bs_super;
+    freeblock_t *fb;
+    u64 total = 0, next, i;
+    
+    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
+    if (bs_super == NULL) {
+        fprintf(stderr, "freelist_count: cannot read superblock\n");
+        return;
+    }
+    
+    if (bs_super->freelist_current == 0ULL) {
+        printf("freelist is empty!\n");
+        goto out;
+    }
+    
+    fb = readblock(bs_super->freelist_current);
+    if (fb == NULL) {
+        fprintf(stderr, "freelist_count: cannot read current freelist block\n");
+        goto out;
+    }
+    printf("%Ld entires on current.\n", fb->count);
+    total += fb->count;
+    if (print_each == 1)
+    {
+        for (i=0; i< fb->count; i++)
+            printf("  %Ld\n", fb->list[i]);
+    }
+    
+    freeblock(fb);
+    
+    if (bs_super->freelist_full == 0ULL) {
+        printf("freelist_full is empty!\n");
+        goto out;
+    }
+    
+    /* Follow the chain of full blocks until a zero "next" link. */
+    next = bs_super->freelist_full;
+    while (next != 0ULL) {
+        fb = readblock(next);
+        if (fb == NULL) {
+            fprintf(stderr, "freelist_count: cannot read freelist block\n");
+            goto out;
+        }
+        total += fb->count;
+        if (print_each == 1)
+        {
+            for (i=0; i< fb->count; i++)
+                printf("  %Ld\n", fb->list[i]);
+        }
+        next = fb->next;
+        freeblock(fb);
+    }
+    printf("Total of %Ld ids on freelist.\n", total);
+
+ out:
+    /* The superblock copy used to be leaked on every path. */
+    freeblock(bs_super);
+}
+
+/*****************************************************************************
+ * Initialisation                                                            *
+ *****************************************************************************/
+
+/*
+ * __init_blockstore: prepare the block store for use
+ *
+ * Remote build (BLOCKSTORE_REMOTE): initialises the mutexes and the read
+ * pool, resolves the configured server hostnames, fills in the cluster
+ * map, binds a UDP socket on BLOCKSTORED_PORT and starts the receive and
+ * queue-runner threads.
+ *
+ * Local build: makes sure "blockstore.dat" exists.  An empty file gets a
+ * fresh superblock; otherwise the existing superblock's magic is checked
+ * and the process exits if it does not match.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int __init_blockstore(void)
+{
+#ifdef BLOCKSTORE_REMOTE
+    int i;
+    struct hostent *addr;
+
+    pthread_mutex_init(&ptmutex_queue, NULL);
+    pthread_mutex_init(&ptmutex_luid, NULL);
+    pthread_mutex_init(&ptmutex_recv, NULL);
+    /*pthread_mutex_init(&ptmutex_notify, NULL);*/
+    /* NOTE(review): inclusive bound touches READ_POOL_SIZE + 1 entries;
+     * confirm pool_thread[] is sized accordingly. */
+    for (i = 0; i <= READ_POOL_SIZE; i++) {
+        pool_thread[i].newdata = 0;
+        pthread_mutex_init(&(pool_thread[i].ptmutex), NULL);
+        pthread_cond_init(&(pool_thread[i].ptcv), NULL);
+    }
+
+    bsservers[0].hostname = "firebug.cl.cam.ac.uk";
+    bsservers[1].hostname = "planb.cl.cam.ac.uk";
+    bsservers[2].hostname = "simcity.cl.cam.ac.uk";
+    bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/;
+    bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/;
+    bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/;
+    bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/;
+    bsservers[7].hostname = NULL/*"felix.cl.cam.ac.uk"*/;
+    bsservers[8].hostname = NULL;
+    bsservers[9].hostname = NULL;
+    bsservers[10].hostname = NULL;
+    bsservers[11].hostname = NULL;
+    bsservers[12].hostname = NULL;
+    bsservers[13].hostname = NULL;
+    bsservers[14].hostname = NULL;
+    bsservers[15].hostname = NULL;
+
+    /* Resolve every configured server; unset slots are skipped. */
+    for (i = 0; i < MAX_SERVERS; i++) {
+        if (!bsservers[i].hostname)
+            continue;
+        addr = gethostbyname(bsservers[i].hostname);
+        if (!addr) {
+            perror("bad hostname");
+            return -1;
+        }
+        bsservers[i].sin.sin_family = addr->h_addrtype;
+        bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT);
+        bsservers[i].sin.sin_addr.s_addr = 
+            ((struct in_addr *)(addr->h_addr))->s_addr;
+    }
+
+    /* Cluster map
+     */
+    bsclusters[0].servers[0] = 0;
+    bsclusters[0].servers[1] = 1;
+    bsclusters[0].servers[2] = 2;
+    bsclusters[1].servers[0] = 1;
+    bsclusters[1].servers[1] = 2;
+    bsclusters[1].servers[2] = 3;
+    bsclusters[2].servers[0] = 2;
+    bsclusters[2].servers[1] = 3;
+    bsclusters[2].servers[2] = 4;
+    bsclusters[3].servers[0] = 3;
+    bsclusters[3].servers[1] = 4;
+    bsclusters[3].servers[2] = 5;
+    bsclusters[4].servers[0] = 4;
+    bsclusters[4].servers[1] = 5;
+    bsclusters[4].servers[2] = 6;
+    bsclusters[5].servers[0] = 5;
+    bsclusters[5].servers[1] = 6;
+    bsclusters[5].servers[2] = 7;
+    bsclusters[6].servers[0] = 6;
+    bsclusters[6].servers[1] = 7;
+    bsclusters[6].servers[2] = 0;
+    bsclusters[7].servers[0] = 7;
+    bsclusters[7].servers[1] = 0;
+    bsclusters[7].servers[2] = 1;
+
+    /* Local socket set up
+     */
+    bssock = socket(AF_INET, SOCK_DGRAM, 0);
+    if (bssock < 0) {
+        perror("Bad socket");
+        return -1;
+    }
+    memset(&sin_local, 0, sizeof(sin_local));
+    sin_local.sin_family = AF_INET;
+    sin_local.sin_port = htons(BLOCKSTORED_PORT);
+    sin_local.sin_addr.s_addr = htonl(INADDR_ANY);
+    if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) {
+        perror("bind");
+        close(bssock);
+        return -1;
+    }
+
+    /* NOTE(review): both thread handles are stored in pthread_recv, so the
+     * first is overwritten - confirm neither thread needs to be joined. */
+    pthread_create(&pthread_recv, NULL, receive_loop, NULL);
+    pthread_create(&pthread_recv, NULL, queue_runner, NULL);
+
+#else /* /BLOCKSTORE_REMOTE */
+    blockstore_super_t *bs_super;
+    int block_fp;
+
+    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
+
+    if (block_fp < 0) {
+        perror("open");
+        return -1;
+    }
+    
+    if (lseek(block_fp, 0, SEEK_END) == 0) {
+        /* Empty store: write a fresh superblock as the first block. */
+        bs_super = newblock();
+        if (bs_super == NULL) {
+            close(block_fp);
+            return -1;
+        }
+        bs_super->magic            = BLOCKSTORE_MAGIC;
+        bs_super->freelist_full    = 0LL;
+        bs_super->freelist_current = 0LL;
+        
+        if (allocblock(bs_super) == 0) {
+            fprintf(stderr, "could not write blockstore superblock\n");
+            freeblock(bs_super);
+            close(block_fp);
+            return -1;
+        }
+        
+        freeblock(bs_super);
+    } else {
+        bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
+        if (bs_super == NULL) {
+            fprintf(stderr, "could not read blockstore superblock\n");
+            close(block_fp);
+            return -1;
+        }
+        if (bs_super->magic != BLOCKSTORE_MAGIC)
+        {
+            printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n");
+            exit(-1);
+        }
+        freeblock(bs_super);
+    }
+        
+    close(block_fp);
+        
+#endif /*  BLOCKSTORE_REMOTE */   
+    return 0;
+}
+
+/*
+ * __exit_blockstore: tear down what __init_blockstore set up
+ *
+ * Remote build: destroys the global mutexes and the per-pool-thread mutex
+ * and condition variable.  Local build: nothing to do.
+ */
+void __exit_blockstore(void)
+{
+#ifdef BLOCKSTORE_REMOTE
+    int i;   /* only needed in the remote build */
+
+    pthread_mutex_destroy(&ptmutex_recv);
+    pthread_mutex_destroy(&ptmutex_luid);
+    pthread_mutex_destroy(&ptmutex_queue);
+    /*pthread_mutex_destroy(&ptmutex_notify);
+      pthread_cond_destroy(&ptcv_notify);*/
+    for (i = 0; i <= READ_POOL_SIZE; i++) {
+        pthread_mutex_destroy(&(pool_thread[i].ptmutex));
+        pthread_cond_destroy(&(pool_thread[i].ptcv));
+    }
+#endif
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/blockstore.h
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/blockstore.h        Sun Jul  3 22:36:48 2005
@@ -0,0 +1,134 @@
+/**************************************************************************
+ * 
+ * blockstore.h
+ *
+ * Simple block store interface
+ *
+ */
+ 
+#ifndef __BLOCKSTORE_H__
+#define __BLOCKSTORE_H__
+
+#include <netinet/in.h>
+#include <xc.h>
+
+/* Size of one store block in bytes, the matching shift, and a mask that
+ * clears the offset-within-block bits of a 64-bit value. */
+#define BLOCK_SIZE  4096
+#define BLOCK_SHIFT   12
+#define BLOCK_MASK  0xfffffffffffff000LL
+
+/* XXX SMH: where is the below supposed to be defined???? */
+#ifndef SECTOR_SHIFT 
+#define SECTOR_SHIFT   9 
+#endif
+
+/* Number of ids held by one freelist block.  The whole expansion is
+ * parenthesized so it stays intact inside larger expressions.
+ * NOTE(review): this subtracts the header size in bytes (3 * sizeof(u64))
+ * from an entry count, so the list is smaller than the block could hold.
+ * The value is kept as-is because it fixes the size of freeblock_t. */
+#define FREEBLOCK_SIZE  ((BLOCK_SIZE / sizeof(u64)) - (3 * sizeof(u64)))
+#define FREEBLOCK_MAGIC 0x0fee0fee0fee0feeULL
+
+/* One freelist block as stored in the block store. */
+typedef struct {
+    u64 magic;                  /* FREEBLOCK_MAGIC */
+    u64 next;                   /* id of next block in the chain, 0 = none */
+    u64 count;                  /* number of valid entries in list[] */
+    u64 list[FREEBLOCK_SIZE];   /* released block ids */
+} freeblock_t; 
+
+#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaaULL
+#define BLOCKSTORE_SUPER 1ULL   /* block id of the superblock */
+
+/* Superblock: anchors the two freelist chains. */
+typedef struct {
+    u64 magic;                  /* BLOCKSTORE_MAGIC */
+    u64 freelist_full;          /* head of the chain of full blocks, 0 = none */
+    u64 freelist_current;       /* block currently being filled, 0 = none */
+} blockstore_super_t;
+
+extern void *newblock();
+extern void *readblock(u64 id);
+extern u64 allocblock(void *block);
+extern u64 allocblock_hint(void *block, u64 hint);
+extern int writeblock(u64 id, void *block);
+
+/* Add this blockid to a freelist, to be recycled by the allocator. */
+extern void releaseblock(u64 id);
+
+/* this is a memory free() operation for block-sized allocations */
+extern void freeblock(void *block);
+extern int __init_blockstore(void);
+
+/* debug for freelist. */
+void freelist_count(int print_each);
+#define ALLOCFAIL (((u64)(-1)))
+
+/* Distribution
+ */
+#define BLOCKSTORED_PORT 9346
+
+/* Header carried by every block store message. */
+struct bshdr_t_struct {
+    u32            operation;   /* one of the BSOP_* codes */
+    u32            flags;       /* BSOP_FLAG_* bits */
+    u64            id;
+    u64            luid;
+} __attribute__ ((packed));
+typedef struct bshdr_t_struct bshdr_t;
+
+/* Full message: header followed by one block of payload. */
+struct bsmsg_t_struct {
+    bshdr_t        hdr;
+    unsigned char  block[BLOCK_SIZE];
+} __attribute__ ((packed));
+
+typedef struct bsmsg_t_struct bsmsg_t;
+
+/* Message lengths, in bytes, up to and including the named field.
+ * (MSGBUFSIZE_ID is written on a single line: the archived copy had it
+ * wrapped, which broke the macro.) */
+#define MSGBUFSIZE_OP    sizeof(u32)
+#define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32))
+#define MSGBUFSIZE_ID    (sizeof(u32) + sizeof(u32) + sizeof(u64) + sizeof(u64))
+#define MSGBUFSIZE_BLOCK sizeof(bsmsg_t)
+
+#define BSOP_READBLOCK  0x01
+#define BSOP_WRITEBLOCK 0x02
+#define BSOP_ALLOCBLOCK 0x03
+#define BSOP_FREEBLOCK  0x04
+
+#define BSOP_FLAG_ERROR 0x01
+
+#define BS_ALLOC_SKIP 10
+#define BS_ALLOC_HACK
+
+/* Remote hosts and cluster map - XXX need to generalise
+ */
+
+/*
+
+  Interim ID format is
+
+  63 60 59                40 39                20 19                 0
+  +----+--------------------+--------------------+--------------------+
+  |map | replica 2          | replica 1          | replica 0          |
+  +----+--------------------+--------------------+--------------------+
+
+  The map is an index into a table detailing which machines form the
+  cluster.
+
+ */
+
+/* Extract the 20-bit per-replica ids and the 4-bit map from a packed id. */
+#define BSID_REPLICA0(_id) ((_id)&0xfffffULL)
+#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL)
+#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL)
+#define BSID_MAP(_id)      (((_id)>>60)&0xfULL)
+
+/* Pack a map index and three replica ids into one 64-bit id.  The inputs
+ * are shifted but not masked, so a replica id wider than 20 bits would
+ * spill into the neighbouring field. */
+#define BSID(_map, _rep0, _rep1, _rep2) ((((u64)(_map))<<60) | \
+                                         (((u64)(_rep2))<<40) | \
+                                         (((u64)(_rep1))<<20) | ((u64)(_rep0)))
+
+/* One remote block server: its hostname and socket address. */
+typedef struct bsserver_t_struct {
+    char              *hostname;
+    struct sockaddr_in sin;
+} bsserver_t;
+
+#define MAX_SERVERS 16
+
+/* A cluster is a set of CLUSTER_MAX_REPLICAS server indices. */
+#define CLUSTER_MAX_REPLICAS 3
+typedef struct bscluster_t_struct {
+    int servers[CLUSTER_MAX_REPLICAS];
+} bscluster_t;
+
+#define MAX_CLUSTERS 16
+
+#endif /* __BLOCKSTORE_H__ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/parallax.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/parallax.c  Sun Jul  3 22:36:48 2005
@@ -0,0 +1,611 @@
+/**************************************************************************
+ * 
+ * parallax.c
+ *
+ * The Parallax Storage Server
+ *
+ */
+ 
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include "blktaplib.h"
+#include "blockstore.h"
+#include "vdi.h"
+#include "block-async.h"
+#include "requests-async.h"
+
+#define PARALLAX_DEV     61440
+/* Multiplier used when reporting a VDI's capacity in parallax_probe().
+ * NOTE(review): presumably sectors per BLOCK_SIZE block - confirm. */
+#define SECTS_PER_NODE   8
+
+
+/* Debug tracing: flip the "#if 0" to 1 to route DPRINTF to printf;
+ * otherwise every DPRINTF call compiles to nothing. */
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* ------[ session records ]----------------------------------------------- */
+
+/* Hash of (domid, handle) into the blkif table; size is a power of two so
+ * the mask selects a bucket. */
+#define BLKIF_HASHSZ 1024
+#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
+
+/* Hash of a virtual device number into a blkif's per-interface VDI table. */
+#define VDI_HASHSZ 16
+#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1))
+
+/* One block interface, identified by (domid, handle). */
+typedef struct blkif {
+    domid_t       domid;
+    unsigned int  handle;
+    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+    vdi_t        *vdi_hash[VDI_HASHSZ];   /* VDIs attached, hashed by vdevice */
+    struct blkif *hash_next;              /* next entry in the same bucket */
+} blkif_t;
+
+/* All known block interfaces, chained per bucket through hash_next. */
+static blkif_t      *blkif_hash[BLKIF_HASHSZ];
+
+/*
+ * blkif_find_by_handle: look up the interface for (domid, handle).
+ * Prints a warning for any non-zero handle.  Returns NULL if no such
+ * interface exists.
+ */
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
+{
+    blkif_t *cur;
+
+    if ( handle != 0 )
+        printf("blktap/parallax don't currently support non-0 dev handles!\n");
+
+    for ( cur = blkif_hash[BLKIF_HASH(domid, handle)];
+          cur != NULL;
+          cur = cur->hash_next )
+    {
+        if ( (cur->domid == domid) && (cur->handle == handle) )
+            return cur;
+    }
+
+    return NULL;
+}
+
+/*
+ * blkif_get_vdi: find the VDI attached to `blkif` as virtual device
+ * `device`.  Returns NULL if none is attached under that number.
+ */
+vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device)
+{
+    vdi_t *cur;
+
+    for (cur = blkif->vdi_hash[VDI_HASH(device)]; cur != NULL; cur = cur->next)
+    {
+        if (cur->vdevice == device)
+            break;
+    }
+
+    return cur;
+}
+
+/* ------[ control message handling ]-------------------------------------- */
+
+/*
+ * blkif_create: register a new block interface for (domid, blkif_handle).
+ * The outcome is reported in create->status: OKAY, OUT_OF_MEMORY, or
+ * INTERFACE_EXISTS if the pair is already registered.
+ */
+void blkif_create(blkif_be_create_t *create)
+{
+    domid_t       domid  = create->domid;
+    unsigned int  handle = create->blkif_handle;
+    blkif_t      *fresh, **link;
+
+    DPRINTF("parallax (blkif_create): create is %p\n", create); 
+
+    /* calloc gives the same zero-filled record as malloc + memset. */
+    fresh = (blkif_t *)calloc(1, sizeof(blkif_t));
+    if ( fresh == NULL )
+    {
+        DPRINTF("Could not create blkif: out of memory\n");
+        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    fresh->domid  = domid;
+    fresh->handle = handle;
+    fresh->status = DISCONNECTED;
+
+    /* Walk to the end of the bucket, refusing duplicates on the way. */
+    for ( link = &blkif_hash[BLKIF_HASH(domid, handle)];
+          *link != NULL;
+          link = &(*link)->hash_next )
+    {
+        if ( ((*link)->domid == domid) && ((*link)->handle == handle) )
+        {
+            DPRINTF("Could not create blkif: already exists (%d,%d)\n",
+                domid, handle);
+            create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
+            free(fresh);
+            return;
+        }
+    }
+
+    /* Append at the tail of the chain. */
+    fresh->hash_next = *link;
+    *link = fresh;
+
+    DPRINTF("Successfully created blkif\n");
+    create->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/*
+ * blkif_destroy: unregister and free the interface for (domid,
+ * blkif_handle).  The outcome is reported in destroy->status: OKAY,
+ * INTERFACE_NOT_FOUND, or INTERFACE_CONNECTED if the interface is not in
+ * the DISCONNECTED state.
+ */
+void blkif_destroy(blkif_be_destroy_t *destroy)
+{
+    domid_t       domid  = destroy->domid;
+    unsigned int  handle = destroy->blkif_handle;
+    blkif_t     **link, *victim;
+
+    DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy); 
+
+    for ( link = &blkif_hash[BLKIF_HASH(domid, handle)];
+          (victim = *link) != NULL;
+          link = &victim->hash_next )
+    {
+        if ( (victim->domid != domid) || (victim->handle != handle) )
+            continue;
+
+        if ( victim->status != DISCONNECTED )
+        {
+            destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
+            return;
+        }
+
+        /* Unlink from the bucket chain, then release. */
+        *link = victim->hash_next;
+        free(victim);
+        destroy->status = BLKIF_BE_STATUS_OKAY;
+        return;
+    }
+
+    destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+}
+
+/*
+ * vbd_create: attach the VDI named by create->dev_handle to an existing
+ * block interface as virtual device create->vdevice.  The outcome is
+ * reported in create->status: OKAY, INTERFACE_NOT_FOUND or VBD_NOT_FOUND.
+ */
+void vbd_create(blkif_be_vbd_create_t *create)
+{
+    blkif_vdev_t   vdevice = create->vdevice;
+    blkif_t       *owner;
+    vdi_t         *image, **tail;
+
+    DPRINTF("parallax (vbd_create): create=%p\n", create); 
+
+    owner = blkif_find_by_handle(create->domid, create->blkif_handle);
+    if ( owner == NULL )
+    {
+        DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n", 
+                create->domid, create->blkif_handle); 
+        create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    /* VDI identifier is in grow->extent.sector_start */
+    DPRINTF("vbd_create: create->dev_handle (id) is %lx\n", 
+            (unsigned long)create->dev_handle);
+
+    image = vdi_get(create->dev_handle);
+    if ( image == NULL )
+    {
+        printf("parallax (vbd_create): VDI %lx not found.\n",
+               (unsigned long)create->dev_handle);
+        create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+        return;
+    }
+
+    image->next    = NULL;
+    image->vdevice = vdevice;
+
+    /* Append at the tail of the device's hash bucket. */
+    for ( tail = &owner->vdi_hash[VDI_HASH(vdevice)];
+          *tail != NULL;
+          tail = &(*tail)->next )
+        ;
+    *tail = image;
+
+    DPRINTF("blkif_create succeeded\n"); 
+    create->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/*
+ * vbd_destroy: detach virtual device destroy->vdevice from its block
+ * interface and drop the reference to its VDI.  The outcome is reported
+ * in destroy->status: OKAY, INTERFACE_NOT_FOUND, or VBD_NOT_FOUND when no
+ * VDI is attached under that device number.
+ */
+void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
+{
+    blkif_t            *blkif;
+    vdi_t              *vdi, **vdip;
+    blkif_vdev_t        vdevice = destroy->vdevice;
+    
+    blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
+    if ( blkif == NULL )
+    {
+        DPRINTF("vbd_destroy attempted for non-existent blkif (%u,%u)\n", 
+                destroy->domid, destroy->blkif_handle); 
+        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
+    while ((*vdip != NULL) && ((*vdip)->vdevice != vdevice))
+        vdip = &(*vdip)->next;
+
+    /* The status used to be left unset on both paths below. */
+    if (*vdip == NULL) 
+    {
+        destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+        return;
+    }
+
+    vdi = *vdip;
+    *vdip = vdi->next;
+    vdi_put(vdi);
+    destroy->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/*
+ * parallax_control: dispatch a block-backend control message
+ *
+ * Only CMSG_BLKIF_BE messages are handled.  The payload length is checked
+ * against the structure expected for the subtype before the handler is
+ * called.  Connect and disconnect are accepted but ignored.  Always
+ * returns 0; malformed or unknown messages are only reported on stdout.
+ */
+int parallax_control(control_msg_t *msg)
+{
+    DPRINTF("parallax_control: msg is %p\n", msg); 
+    
+    if (msg->type != CMSG_BLKIF_BE) 
+    {
+        printf("Unexpected control message (%d)\n", msg->type);
+        return 0;
+    }
+    
+    switch(msg->subtype)
+    {
+    case CMSG_BLKIF_BE_CREATE:
+        if ( msg->length != sizeof(blkif_be_create_t) )
+            goto parse_error;
+        blkif_create((blkif_be_create_t *)msg->msg);
+        break;   
+        
+    case CMSG_BLKIF_BE_DESTROY:
+        if ( msg->length != sizeof(blkif_be_destroy_t) )
+            goto parse_error;
+        blkif_destroy((blkif_be_destroy_t *)msg->msg);
+        break;  
+        
+    case CMSG_BLKIF_BE_VBD_CREATE:
+        if ( msg->length != sizeof(blkif_be_vbd_create_t) )
+            goto parse_error;
+        vbd_create((blkif_be_vbd_create_t *)msg->msg);
+        break;
+        
+    case CMSG_BLKIF_BE_VBD_DESTROY:
+        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
+            goto parse_error;
+        vbd_destroy((blkif_be_vbd_destroy_t *)msg->msg);
+        break;
+
+    case CMSG_BLKIF_BE_CONNECT:
+    case CMSG_BLKIF_BE_DISCONNECT:
+        /* we don't manage the device channel, the tap does. */
+        break;
+
+    default:
+        goto parse_error;
+    }
+    return 0;
+parse_error:
+    printf("Bad control message!\n");
+    return 0;
+    
+}    
+
+/*
+ * parallax_probe: answer a probe request with the list of attached VDIs
+ *
+ * The request must carry exactly one segment covering sectors 0..7.  For a
+ * valid request one vdisk_t per attached VDI is written to the request's
+ * mapped buffer and the response status is the number of disks; otherwise
+ * the status is BLKIF_RSP_ERROR.  The response is built in place over the
+ * request, and BLKTAP_RESPOND is returned either way.
+ */
+int parallax_probe(blkif_request_t *req, blkif_t *blkif)
+{
+    blkif_response_t *rsp = (blkif_response_t *)req;
+    vdisk_t *img_info;
+    vdi_t *vdi;
+    int bucket, nr_vdis = 0; 
+    int ok;
+
+    DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif); 
+
+    /* We expect one buffer only, and it must be page-sized. */
+    ok = (req->nr_segments == 1) &&
+         (blkif_first_sect(req->frame_and_sects[0]) == 0) &&
+         (blkif_last_sect (req->frame_and_sects[0]) == 7);
+
+    if ( ok ) {
+        /* fill the list of devices */
+        for (bucket = 0; bucket < VDI_HASHSZ; bucket++) {
+            for (vdi = blkif->vdi_hash[bucket]; vdi != NULL; vdi = vdi->next) {
+                img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0);
+                img_info[nr_vdis].device   = vdi->vdevice;
+                img_info[nr_vdis].info     = 0;
+                /* The -1 here accounts for the LSB in the radix tree */
+                img_info[nr_vdis].capacity = 
+                        ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE);
+                nr_vdis++;
+            }
+        }
+    }
+
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_PROBE;
+    if ( ok ) {
+        rsp->status = nr_vdis; /* number of disks */
+        DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis);
+    } else {
+        rsp->status = BLKIF_RSP_ERROR;
+        DPRINTF("parallax_probe: send error response\n"); 
+    }
+
+    return BLKTAP_RESPOND;
+}
+
+/* Bookkeeping for one in-flight request whose segments complete
+ * asynchronously. */
+typedef struct {
+    blkif_request_t *req;    /* the request being serviced */
+    int              count;  /* decremented by each callback; the response
+                              * is sent when it reaches zero */
+    int              error;  /* number of segments that failed */
+    pthread_mutex_t  mutex;  /* held while count is updated */
+} pending_t;
+
+/* One pending record per request slot, indexed by ID_TO_IDX(req->id). */
+#define MAX_REQUESTS 64
+pending_t pending_list[MAX_REQUESTS];
+
+/* Per-segment context handed to the I/O completion callback; freed by the
+ * callback. */
+struct cb_param {
+    pending_t *pent;         /* pending record of the owning request */
+    int       segment;       /* index into req->frame_and_sects[] */
+    u64       sector; 
+    u64       vblock; /* for debug printing -- can be removed. */
+};
+
+/*
+ * read_cb: completion callback for one segment of a read request
+ *   @r: I/O result; IO_BLOCK(r) is the block that was read, or NULL
+ *   @in_param: the struct cb_param for this segment (freed here)
+ *
+ * On success the requested sectors are copied from the block into the
+ * request's mapped page and the block is freed.  The pending record is
+ * then updated under its mutex; the callback that brings the count to
+ * zero sends the response, with an error status if any segment failed.
+ */
+static void read_cb(struct io_ret r, void *in_param)
+{
+    struct cb_param *param = (struct cb_param *)in_param;
+    pending_t *p = param->pent;
+    int segment = param->segment;
+    blkif_request_t *req = p->req;
+    unsigned long size, offset, start;
+    char *dpage, *spage;
+    int failed = 0;
+       
+    spage  = IO_BLOCK(r);
+    if (spage == NULL) {
+        failed = 1;
+    } else {
+        dpage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment);
+    
+        /* Calculate read size and offset within the read block. */
+
+        offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE;
+        size = ( blkif_last_sect (req->frame_and_sects[segment]) -
+                 blkif_first_sect(req->frame_and_sects[segment]) + 1
+            ) << SECTOR_SHIFT;
+        start = blkif_first_sect(req->frame_and_sects[segment]) 
+            << SECTOR_SHIFT;
+
+        DPRINTF("ParallaxRead: sect: %lld (%ld,%ld),  "
+                "vblock %llx, "
+                "size %lx\n", 
+                param->sector, blkif_first_sect(p->req->frame_and_sects[segment]),
+                blkif_last_sect (p->req->frame_and_sects[segment]),
+                param->vblock, size); 
+
+        memcpy(dpage + start, spage + offset, size);
+        freeblock(spage);
+    }
+    
+    /* Done the read.  Now update the pending record.  The error counter is
+     * bumped under the same lock as the count, so concurrent callbacks for
+     * other segments cannot race on it. */
+    pthread_mutex_lock(&p->mutex);
+    if (failed)
+        p->error++;
+    p->count--;
+    
+    if (p->count == 0) {
+        blkif_response_t *rsp;
+        
+        rsp = (blkif_response_t *)req;
+        rsp->id = req->id;
+        rsp->operation = BLKIF_OP_READ;
+        if (p->error == 0) {
+            rsp->status = BLKIF_RSP_OKAY;
+        } else {
+            rsp->status = BLKIF_RSP_ERROR;
+        }
+        blktap_inject_response(rsp);       
+    }
+    
+    pthread_mutex_unlock(&p->mutex);
+       
+    free(param); /* TODO: replace with cached alloc/dealloc */
+}      
+
+/*
+ * parallax_read: service a BLKIF_OP_READ request.
+ *
+ * One asynchronous vdi_read is issued per segment; read_cb copies the data
+ * into place and the last callback to finish injects the response.
+ *
+ * Returns BLKTAP_STOLEN when the response will be injected later by a
+ * callback, or BLKTAP_RESPOND when req has been turned into an error
+ * response that the caller must send.
+ */
+int parallax_read(blkif_request_t *req, blkif_t *blkif)
+{
+    blkif_response_t *rsp;
+    u64 vblock;
+    vdi_t *vdi;
+    u64 sector;
+    int i;
+    pending_t *pent;
+
+    vdi = blkif_get_vdi(blkif, req->device);
+
+    if ( vdi == NULL )
+        goto err;
+
+    /* With no segments no callback would ever run to send a response. */
+    if ( req->nr_segments == 0 )
+        goto err;
+
+    pent = &pending_list[ID_TO_IDX(req->id)];
+    pent->count = req->nr_segments;
+    pent->error = 0; /* slots are reused: drop errors of earlier requests */
+    pent->req = req;
+    pthread_mutex_init(&pent->mutex, NULL);
+
+    for (i = 0; i < req->nr_segments; i++) {
+        struct cb_param *p;
+        int last;
+
+        /* Round the requested segment to a block address. */
+        sector  = req->sector_number + (8*i);
+        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
+
+        /* TODO: Replace this call to malloc with a cached allocation */
+        p = (struct cb_param *)malloc(sizeof(struct cb_param));
+        if (p != NULL) {
+            p->pent = pent;
+            p->sector = sector;
+            p->segment = i;
+            p->vblock = vblock; /* dbg */
+
+            /* Get that block from the store. */
+            if (vdi_read(vdi, vblock, read_cb, (void *)p) == 0)
+                continue;
+
+            /* vdi_read rejected the request, so read_cb will not run. */
+            free(p);
+        }
+
+        /*
+         * This segment could not be issued.  Account for it the way its
+         * callback would have, so the request still completes with an
+         * error instead of waiting forever.
+         */
+        pthread_mutex_lock(&pent->mutex);
+        pent->error++;
+        last = (--pent->count == 0);
+        pthread_mutex_unlock(&pent->mutex);
+
+        /* Count zero means every other segment has already completed   */
+        /* without responding, so the error response is ours to send.   */
+        if (last)
+            goto err;
+    }
+
+    return BLKTAP_STOLEN;
+
+err:
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_READ;
+    rsp->status = BLKIF_RSP_ERROR;
+
+    return BLKTAP_RESPOND;
+}
+
+/*
+ * write_cb: completion callback for one segment of a parallax write.
+ *
+ * r        -- result of the store write; IO_INT(r) < 0 signals failure.
+ * in_param -- the struct cb_param allocated by parallax_write for this
+ *             segment; it is freed here.
+ *
+ * Updates the pending record, and the callback that brings the
+ * outstanding count to zero injects the response.
+ */
+static void write_cb(struct io_ret r, void *in_param)
+{
+    struct cb_param *param = (struct cb_param *)in_param;
+    pending_t *p = param->pent;
+    blkif_request_t *req = p->req;
+    int failed;
+
+    /* catch errors from the block code. */
+    failed = (IO_INT(r) < 0);
+
+    /* The error counter is shared with the callbacks of the other      */
+    /* segments, so it is updated under the same mutex as the count.    */
+    pthread_mutex_lock(&p->mutex);
+    if (failed)
+        p->error++;
+    p->count--;
+
+    if (p->count == 0) {
+        /* Last segment: the request buffer is reused as the response. */
+        blkif_response_t *rsp;
+
+        rsp = (blkif_response_t *)req;
+        rsp->id = req->id;
+        rsp->operation = BLKIF_OP_WRITE;
+        if (p->error == 0) {
+            rsp->status = BLKIF_RSP_OKAY;
+        } else {
+            rsp->status = BLKIF_RSP_ERROR;
+        }
+        blktap_inject_response(rsp);
+    }
+
+    pthread_mutex_unlock(&p->mutex);
+
+    free(param); /* TODO: replace with cached alloc/dealloc */
+}
+
+/*
+ * parallax_write: service a BLKIF_OP_WRITE request.
+ *
+ * Only whole, block-aligned segments are supported.  All segments are
+ * validated before any write is issued, so a malformed request is rejected
+ * without leaving earlier segments in flight.  One asynchronous vdi_write
+ * is then issued per segment; the last write_cb to finish injects the
+ * response.
+ *
+ * Returns BLKTAP_STOLEN when the response will be injected later by a
+ * callback, or BLKTAP_RESPOND when req has been turned into an error
+ * response that the caller must send.
+ */
+int parallax_write(blkif_request_t *req, blkif_t *blkif)
+{
+    blkif_response_t *rsp;
+    u64 sector;
+    int i;
+    u64 vblock;
+    char *spage;
+    unsigned long size, offset, start;
+    vdi_t *vdi;
+    pending_t *pent;
+
+    vdi = blkif_get_vdi(blkif, req->device);
+
+    if ( vdi == NULL )
+        goto err;
+
+    /* With no segments no callback would ever run to send a response. */
+    if ( req->nr_segments == 0 )
+        goto err;
+
+    /* Pass 1: check every segment before anything is written. */
+    for (i = 0; i < req->nr_segments; i++) {
+
+        /* Round the requested segment to a block address. */
+        sector  = req->sector_number + (8*i);
+        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
+
+        /* Calculate write size and offset within the block. */
+        offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE;
+        size = ( blkif_last_sect (req->frame_and_sects[i]) -
+                 blkif_first_sect(req->frame_and_sects[i]) + 1
+            ) << SECTOR_SHIFT;
+        start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
+
+        DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld),  "
+                "vblock %llx, "
+                "size %lx\n",
+                sector, blkif_first_sect(req->frame_and_sects[i]),
+                blkif_last_sect (req->frame_and_sects[i]),
+                vblock, size);
+
+        /* XXX: For now we just freak out if they try to write a   */
+        /* non block-sized, block-aligned page.                    */
+        if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) {
+            printf("]\n] STRANGE WRITE!\n]\n");
+            goto err;
+        }
+    }
+
+    pent = &pending_list[ID_TO_IDX(req->id)];
+    pent->count = req->nr_segments;
+    pent->error = 0; /* slots are reused: drop errors of earlier requests */
+    pent->req = req;
+    pthread_mutex_init(&pent->mutex, NULL);
+
+    /* Pass 2: issue one write per segment. */
+    for (i = 0; i < req->nr_segments; i++) {
+        struct cb_param *p;
+        int last;
+
+        spage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
+        sector = req->sector_number + (8*i);
+        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
+
+        /* TODO: Replace this call to malloc with a cached allocation */
+        p = (struct cb_param *)malloc(sizeof(struct cb_param));
+        if (p != NULL) {
+            p->pent = pent;
+            p->sector = sector;
+            p->segment = i;
+            p->vblock = vblock; /* dbg */
+
+            /* Issue the write to the store. */
+            if (vdi_write(vdi, vblock, spage, write_cb, (void *)p) == 0)
+                continue;
+
+            /* vdi_write rejected the request, so write_cb will not run. */
+            free(p);
+        }
+
+        /*
+         * This segment could not be issued.  Account for it the way its
+         * callback would have, so the request still completes with an
+         * error instead of waiting forever.
+         */
+        pthread_mutex_lock(&pent->mutex);
+        pent->error++;
+        last = (--pent->count == 0);
+        pthread_mutex_unlock(&pent->mutex);
+
+        /* Count zero means every other segment has already completed   */
+        /* without responding, so the error response is ours to send.   */
+        if (last)
+            goto err;
+    }
+
+    return BLKTAP_STOLEN;
+
+err:
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_WRITE;
+    rsp->status = BLKIF_RSP_ERROR;
+
+    return BLKTAP_RESPOND;
+}
+
+/*
+ * parallax_request: request hook registered with blktap.
+ *
+ * Looks up the block interface for the domain encoded in the request id
+ * and dispatches on the operation.  When there is no such interface, or
+ * the operation is not recognised, req is turned into an error response.
+ */
+int parallax_request(blkif_request_t *req)
+{
+    blkif_response_t *reply;
+    domid_t  domain = ID_TO_DOM(req->id);
+    blkif_t *iface  = blkif_find_by_handle(domain, 0);
+
+    if (iface != NULL) {
+        switch (req->operation) {
+        case BLKIF_OP_PROBE:
+            return parallax_probe(req, iface);
+
+        case BLKIF_OP_READ:
+            return parallax_read(req, iface);
+
+        case BLKIF_OP_WRITE:
+            return parallax_write(req, iface);
+
+        default:
+            /* Unknown operation: fall out to the error response. */
+            printf("Unknown request message type!\n");
+            break;
+        }
+    }
+
+    /* The request buffer doubles as the response buffer. */
+    reply = (blkif_response_t *)req;
+    reply->operation = req->operation;
+    reply->id = req->id;
+    reply->status = BLKIF_RSP_ERROR;
+    return BLKTAP_RESPOND;
+}
+
+/* Start-of-day initialisation for this file: zero the blkif_hash table. */
+void __init_parallax(void) 
+{
+    memset(blkif_hash, 0, sizeof(blkif_hash));
+}
+
+
+
+/*
+ * Entry point of the parallax daemon: initialise the block store, the
+ * async block layer, the VDI registry and the local tables, then register
+ * the control and request hooks and hand control to blktap_listen().
+ *
+ * Returns 1 if the VDI registry cannot be initialised, 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+    DPRINTF("parallax: starting.\n");
+    __init_blockstore();
+    DPRINTF("parallax: initialized blockstore...\n");
+    init_block_async();
+    DPRINTF("parallax: initialized async blocks...\n");
+
+    /* __init_vdi reports a missing/uncreatable registry with non-zero. */
+    if (__init_vdi() != 0) {
+        printf("parallax: couldn't initialize the VDI registry.\n");
+        return 1;
+    }
+    DPRINTF("parallax: initialized vdi registry etc...\n");
+    __init_parallax();
+    DPRINTF("parallax: initialized local stuff..\n");
+
+    blktap_register_ctrl_hook("parallax_control", parallax_control);
+    blktap_register_request_hook("parallax_request", parallax_request);
+    DPRINTF("parallax: added ctrl + request hooks, starting listen...\n");
+    blktap_listen();
+
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi.c       Sun Jul  3 22:36:48 2005
@@ -0,0 +1,367 @@
+/**************************************************************************
+ * 
+ * vdi.c
+ *
+ * Virtual Disk Image (VDI) Interfaces
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include "blockstore.h"
+#include "block-async.h"
+#include "requests-async.h"
+#include "radix.h"
+#include "vdi.h"
+                    
+#define VDI_REG_BLOCK   2LL
+#define VDI_RADIX_ROOT  writable(3)
+                                                            
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* I haven't decided about this registry stuff, so this is just a really
+ * quick lash-up so that there is some way to track VDIs.
+ *
+ * (Most vdi access should be with a direct handle to the block, so this
+ *  registry is just for start-of-day lookup and other control operations.)
+ */
+
+/*
+ * Create a fresh VDI registry and write it to VDI_REG_BLOCK.
+ *
+ * Returns the in-memory registry block, or NULL if no block could be
+ * obtained.
+ */
+vdi_registry_t *create_vdi_registry(void)
+{
+    vdi_registry_t *registry;
+
+    registry = (vdi_registry_t *)newblock();
+    if (registry != NULL) {
+        /* The block is still empty here, so use it to zero-fill the */
+        /* vdi radix root before it gets any registry contents.      */
+        writeblock(VDI_RADIX_ROOT, (void *)registry);
+
+        DPRINTF("[vdi.c] Creating VDI registry!\n");
+        registry->nr_vdis = 0;
+        registry->magic   = VDI_REG_MAGIC;
+
+        writeblock(VDI_REG_BLOCK, (void *)registry);
+    }
+
+    return registry;
+}
+    
+/*
+ * Read the VDI registry from VDI_REG_BLOCK, creating it if the block
+ * cannot be read.
+ *
+ * Returns a block the caller must release with freeblock(), or NULL if
+ * the registry could not be read or created, or if its magic number is
+ * wrong.
+ */
+vdi_registry_t *get_vdi_registry(void)
+{
+    vdi_registry_t *vdi_reg = (vdi_registry_t *)readblock(VDI_REG_BLOCK);
+
+    if ( vdi_reg == NULL ) {
+        vdi_reg = create_vdi_registry();
+        /* Creation can fail too; don't look at the magic of a NULL block. */
+        if ( vdi_reg == NULL )
+            return NULL;
+    }
+
+    if ( vdi_reg->magic != VDI_REG_MAGIC ) {
+        freeblock(vdi_reg);
+        return NULL;
+    }
+
+    return vdi_reg;
+}
+
+
+/*
+ * Create a new VDI and add it to the registry.
+ *
+ * parent_snap -- snapshot to fork from.  When snap_get_id() finds no
+ *                record for it, the VDI starts with a fresh radix root.
+ * name        -- human readable name; truncated to fit vdi->name.
+ *
+ * Returns the new vdi (release it with vdi_put()), or NULL on failure.
+ */
+vdi_t *vdi_create(snap_id_t *parent_snap, char *name)
+{
+    int ret;
+    vdi_t *vdi;
+    vdi_registry_t *vdi_reg;
+    snap_rec_t snap_rec;
+
+    /* create a vdi struct */
+    vdi = newblock();
+    if (vdi == NULL)
+        return NULL;
+
+    if ( snap_get_id(parent_snap, &snap_rec) == 0 ) {
+        vdi->radix_root = snapshot(snap_rec.radix_root);
+    } else {
+        vdi->radix_root = allocblock((void *)vdi); /* vdi is just zeros here */
+        vdi->radix_root = writable(vdi->radix_root); /* grr. */
+    }
+
+    /* create a snapshot log, and add it to the vdi struct */
+
+    ret = snap_block_create(parent_snap, &vdi->snap);
+    if ( ret != 0 ) {
+        DPRINTF("Error getting snap block in vdi_create.\n");
+        freeblock(vdi);
+        return NULL;
+    }
+
+    /* append the vdi to the registry, fill block and id.             */
+    /* implicit allocation means we have to write the vdi twice here. */
+    vdi_reg    = get_vdi_registry();
+    if ( vdi_reg == NULL ) {
+        freeblock(vdi);
+        return NULL;
+    }
+
+    vdi->block = allocblock((void *)vdi);
+    vdi->id    = vdi_reg->nr_vdis++;
+    /* name[] holds VDI_NAME_SZ bytes, so the terminator belongs in the */
+    /* last element, not one past it.                                   */
+    strncpy(vdi->name, name, VDI_NAME_SZ - 1);
+    vdi->name[VDI_NAME_SZ - 1] = '\0';
+    vdi->radix_lock = NULL; /* for tidiness */
+    writeblock(vdi->block, (void *)vdi);
+
+    update(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi->id, vdi->block);
+    writeblock(VDI_REG_BLOCK, (void *)vdi_reg);
+    freeblock(vdi_reg);
+
+    /* The lock is in-memory state, attached only after the vdi has */
+    /* been written out with radix_lock set to NULL.                */
+    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
+    if (vdi->radix_lock == NULL)
+    {
+       perror("couldn't malloc radix_lock for new vdi!");
+       freeblock(vdi);
+       return NULL;
+    }
+    radix_lock_init(vdi->radix_lock);
+
+    return vdi;
+}
+
+/* vdi_get and vdi_put currently act more like alloc/free -- they don't 
+ * do refcount-based allocation.  
+ */
+/*
+ * Load the VDI with the given id and attach a fresh radix lock to it.
+ *
+ * Returns the vdi (release it with vdi_put()), or NULL if the id is not
+ * in the registry, the block cannot be read, or the lock cannot be
+ * allocated.
+ */
+vdi_t *vdi_get(u64 vdi_id)
+{
+    u64 vdi_blk;
+    vdi_t *vdi;
+
+    vdi_blk = lookup(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi_id);
+
+    if ( vdi_blk == 0 )
+        return NULL;
+
+    vdi = (vdi_t *)readblock(vdi_blk);
+    /* readblock can fail (get_vdi_registry checks for it as well). */
+    if ( vdi == NULL )
+        return NULL;
+
+    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
+    if (vdi->radix_lock == NULL)
+    {
+       perror("couldn't malloc radix_lock for new vdi!");
+       freeblock(vdi);
+       return NULL;
+    }
+    radix_lock_init(vdi->radix_lock);
+
+    return vdi;
+}
+
+/*
+ * Release a vdi obtained from vdi_get() or vdi_create(): frees the
+ * malloc()ed radix lock first, then the block holding the vdi itself.
+ */
+void vdi_put(vdi_t *vdi)
+{
+    free(vdi->radix_lock);
+    freeblock(vdi);
+}
+
+/*
+ * Take a snapshot of a VDI: record the current radix root and time in
+ * the snapshot log, switch the VDI to a snapshotted root, and write the
+ * VDI back to its block.  If the log append fails, the VDI is not
+ * written back.
+ */
+void vdi_snapshot(vdi_t *vdi)
+{
+    snap_rec_t record;
+
+    /* Describe the state being frozen before the root is replaced. */
+    record.deleted    = 0;
+    record.radix_root = vdi->radix_root;
+    gettimeofday(&record.timestamp, NULL);
+
+    vdi->radix_root = snapshot(vdi->radix_root);
+
+    if ( snap_append(&vdi->snap, &record, &vdi->snap) != 0 ) {
+        printf("snap_append returned failure\n");
+        return;
+    }
+
+    writeblock(vdi->block, vdi);
+}
+    
+/*
+ * Start-of-day initialisation for the VDI layer.
+ *
+ * Returns 0 on success, -1 if the registry can be neither read nor
+ * created.
+ */
+int __init_vdi()
+{
+    vdi_registry_t *registry;
+
+    /* sneak this in here for the moment. */
+    __rcache_init();
+
+    /* Fetching the registry creates it when it does not exist yet. */
+    registry = get_vdi_registry();
+    if (registry != NULL) {
+        freeblock(registry);
+        return 0;
+    }
+
+    printf("[vdi.c] Couldn't get/create a VDI registry!\n");
+    return -1;
+}
+    
+#ifdef VDI_STANDALONE
+
+#define TEST_VDIS      50
+#define NR_ITERS    50000
+#define FORK_POINTS   200
+#define INIT_VDIS       3
+#define INIT_SNAPS     40
+
+/* These must be of decreasing size: */
+#define NEW_FORK       (RAND_MAX-(RAND_MAX/1000))
+#define NEW_ROOT_VDI   (RAND_MAX-((RAND_MAX/1000)*2))
+#define NEW_FORK_VDI   (RAND_MAX-((RAND_MAX/1000)*3))
+
+#define GRAPH_DOT_FILE "vdi.dot"
+#define GRAPH_PS_FILE  "vdi.ps"
+
+
+/* Chained hash set of the snapshot ids that already have a graph node. */
+typedef struct sh_st {
+    snap_id_t     id;
+    struct sh_st *next;
+} sh_t;
+
+#define SNAP_HASHSZ 1024
+sh_t *node_hash[SNAP_HASHSZ];
+/* Hash in unsigned arithmetic so the bucket index can never be negative. */
+#define SNAP_HASH(_id) \
+    ((unsigned int)((_id)->block ^ (_id)->index) % SNAP_HASHSZ)
+
+#define SNAPID_EQUAL(_a,_b) \
+    (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
+
+/*
+ * Record a snapshot id in the hash set.
+ *
+ * Returns 1 if the id was already present, 0 if it was not.  A new id is
+ * appended to its bucket; if that allocation fails the id is simply not
+ * recorded.
+ */
+int sh_check_and_add(snap_id_t *id)
+{
+    sh_t **s = &node_hash[SNAP_HASH(id)];
+
+    /* Walk the chain by advancing the link pointer.  Assigning through */
+    /* *s here would overwrite the bucket head and lose the chain.      */
+    while (*s != NULL) {
+        if (SNAPID_EQUAL(&((*s)->id), id))
+            return 1;
+        s = &(*s)->next;
+    }
+
+    /* s now addresses the NULL link at the end of the chain. */
+    *s = (sh_t *)malloc(sizeof(sh_t));
+    if (*s == NULL) {
+        perror("couldn't malloc snapshot hash entry");
+        return 0;
+    }
+    (*s)->id = *id;
+    (*s)->next = NULL;
+
+    return 0;
+}
+
+/*
+ * Standalone stress test (built with VDI_STANDALONE).
+ *
+ * Creates a few seed VDIs, runs a random workload of snapshots, new fork
+ * points, new root VDIs and new forked VDIs, then dumps the resulting
+ * snapshot graph to GRAPH_DOT_FILE and renders it with dot.
+ *
+ * Returns 0 on success, 1 if a seed VDI or the dot file cannot be created.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi_list[TEST_VDIS];
+    vdi_t *new_vdi;
+    snap_id_t id, fork_points[FORK_POINTS];
+    int nr_vdis = 0, nr_forks = 0;
+    int i, j, r;
+    FILE *f;
+    char name[VDI_NAME_SZ];
+
+    __init_blockstore();
+    __init_vdi();
+
+    printf("[o] Generating seed VDIs. (%d VDIs)\n", INIT_VDIS);
+
+    for (i=0; i<INIT_VDIS; i++) {
+        r=rand();
+
+        sprintf(name, "VDI Number %d", nr_vdis);
+        vdi_list[i] = vdi_create(NULL, name);
+        if (vdi_list[i] == NULL) {
+            printf("[!] Couldn't create seed VDI %d\n", i);
+            return 1;
+        }
+        for (j=0; j<(r%INIT_SNAPS); j++)
+            vdi_snapshot(vdi_list[i]);
+        fork_points[i] = vdi_list[i]->snap;
+        nr_vdis++;
+        nr_forks++;
+    }
+
+    printf("[o] Running a random workload. (%d iterations)\n", NR_ITERS);
+
+    for (i=0; i<NR_ITERS; i++) {
+        r = rand();
+
+        if ( r > NEW_FORK ) {
+            /* fork_points has FORK_POINTS slots: stop once it is full. */
+            if ( nr_forks >= FORK_POINTS )
+                continue;
+            id = vdi_list[r%nr_vdis]->snap;
+            if ( ( id.block == 0 ) || ( id.index == 0 ) )
+                continue;
+            id.index--;
+            fork_points[nr_forks++] = id;
+
+        } else if ( r > NEW_ROOT_VDI ) {
+
+            if ( nr_vdis == TEST_VDIS )
+                continue;
+
+            sprintf(name, "VDI Number %d.", nr_vdis);
+            /* Only keep VDIs that were actually created. */
+            new_vdi = vdi_create(NULL, name);
+            if ( new_vdi == NULL )
+                continue;
+            vdi_list[nr_vdis++] = new_vdi;
+
+        } else if ( r > NEW_FORK_VDI ) {
+
+            if ( nr_vdis == TEST_VDIS )
+                continue;
+
+            sprintf(name, "VDI Number %d.", nr_vdis);
+            new_vdi = vdi_create(&fork_points[r%nr_forks], name);
+            if ( new_vdi == NULL )
+                continue;
+            vdi_list[nr_vdis++] = new_vdi;
+
+        } else /* SNAPSHOT */ {
+
+            vdi_snapshot(vdi_list[r%nr_vdis]);
+
+        }
+    }
+
+    /* now dump it out to a dot file. */
+    printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis);
+
+    f = fopen(GRAPH_DOT_FILE, "w");
+    if (f == NULL) {
+        perror(GRAPH_DOT_FILE);
+        return 1;
+    }
+
+    /* write graph preamble */
+    fprintf(f, "digraph G {\n");
+    fprintf(f, "   rankdir=LR\n");
+
+    for (i=0; i<nr_vdis; i++) {
+        char oldnode[255];
+        snap_block_t *blk;
+        snap_id_t id = vdi_list[i]->snap;
+        int nr_snaps, done=0;
+
+        /* add a node for the id */
+        printf("vdi: %d\n", i);
+        fprintf(f, "   n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n",
+                id.block, id.index, vdi_list[i]->name,
+                id.block, id.index);
+        sprintf(oldnode, "n%Ld%d", id.block, id.index);
+
+        /* Follow the fork chain back until it ends or reaches a node */
+        /* that has already been drawn.                               */
+        while (id.block != 0) {
+            blk = snap_get_block(id.block);
+            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
+            id = blk->hdr.fork_block;
+
+            done = sh_check_and_add(&id);
+
+            /* add a node for the fork_id */
+            if (!done) {
+                fprintf(f, "   n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n",
+                    id.block, id.index,
+                    id.block, id.index);
+            }
+
+            /* add an edge between them */
+            fprintf(f, "   n%Ld%d -> %s [label=\"%u snapshots\"]\n",
+                    id.block, id.index, oldnode, nr_snaps);
+            sprintf(oldnode, "n%Ld%d", id.block, id.index);
+            freeblock(blk);
+
+            if (done) break;
+        }
+    }
+
+    /* write graph postamble */
+    fprintf(f, "}\n");
+    fclose(f);
+
+    printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE);
+    {
+        char cmd[255];
+        sprintf(cmd, "dot %s -Tps -o %s", GRAPH_DOT_FILE, GRAPH_PS_FILE);
+        system(cmd);
+    }
+    return 0;
+}
+
+#endif
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi.h
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi.h       Sun Jul  3 22:36:48 2005
@@ -0,0 +1,55 @@
+#ifndef _VDI_H_
+#define _VDI_H_
+/**************************************************************************
+ * 
+ * vdi.h
+ *
+ * Virtual Disk Image (VDI) Interfaces
+ *
+ */
+
+#ifndef __VDI_H__
+#define __VDI_H__
+
+#include "blktaplib.h"
+#include "snaplog.h"
+
+#define VDI_HEIGHT     27 /* Note that these are now hard-coded */
+#define VDI_REG_HEIGHT 27 /* in the async lookup code           */
+
+#define VDI_NAME_SZ 256
+
+
+/*
+ * A virtual disk image.  vdi.c writes this structure to the block store
+ * (to the block named by 'block'); radix_lock is a malloc()ed in-memory
+ * pointer that vdi_create/vdi_get attach after loading and that vdi_put
+ * frees.
+ */
+typedef struct vdi {
+    u64         id;               /* unique vdi id -- used by the registry   */
+    u64         block;            /* block where this vdi lives (also unique)*/
+    u64         radix_root;       /* radix root node for block mappings      */
+    snap_id_t   snap;             /* next snapshot slot for this VDI         */
+    struct vdi *next;             /* used to hash-chain in blkif.            */
+    blkif_vdev_t vdevice;         /* currently mounted as...                 */
+    struct radix_lock *radix_lock;/* per-line L1 RW lock for parallel reqs   */
+    char        name[VDI_NAME_SZ];/* human readable vdi name                 */
+} vdi_t;
+
+/* Marker checked by get_vdi_registry() to recognise a registry block. */
+#define VDI_REG_MAGIC   0xff00ff0bb0ff00ffLL
+
+/* Registry header, as written to the registry block by vdi.c. */
+typedef struct vdi_registry {
+    u64     magic;      /* VDI_REG_MAGIC for a valid registry           */
+    u64     nr_vdis;    /* VDIs created so far; also the next vdi id    */
+} vdi_registry_t;
+
+
+/* Initialise the VDI layer; returns 0 on success, -1 on failure. */
+int __init_vdi(void);
+
+/* Load a VDI by id (NULL if unknown); release it with vdi_put(). */
+vdi_t *vdi_get(u64 vdi_id);
+void vdi_put(vdi_t *vdi);
+/* Read (or create) the registry; the caller frees it with freeblock(). */
+vdi_registry_t *get_vdi_registry(void);
+/* Create a new VDI, optionally forked from parent_snap; NULL on failure. */
+vdi_t *vdi_create(snap_id_t *parent_snap, char *name);
+/* NOTE(review): vdi.c in this changeset does not define the next two  */
+/* functions -- confirm they are still implemented somewhere.          */
+u64 vdi_lookup_block(vdi_t *vdi, u64 vdi_block, int *writable);
+void vdi_update_block(vdi_t *vdi, u64 vdi_block, u64 g_block);
+/* Append a snapshot record for the VDI and give it a snapshotted root. */
+void vdi_snapshot(vdi_t *vdi);
+
+
+#endif /* __VDI_H__ */
+
+#endif //_VDI_H_
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/requests-async.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/requests-async.c    Sun Jul  3 22:36:48 2005
@@ -0,0 +1,762 @@
+/* requests-async.c
+ *
+ * asynchronous request dispatcher for radix access in parallax.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <pthread.h>
+#include <err.h>
+#include <zlib.h> /* for crc32() */
+#include "requests-async.h"
+#include "vdi.h"
+#include "radix.h"
+
+#define L1_IDX(_a) (((_a) & 0x0000000007fc0000ULL) >> 18)
+#define L2_IDX(_a) (((_a) & 0x000000000003fe00ULL) >> 9)
+#define L3_IDX(_a) (((_a) & 0x00000000000001ffULL))
+
+
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/*
+ * Per-block metadata kept in the L3 radix slot that follows the block's
+ * address slot (read_cb/write_cb access it at index L3_IDX(vaddr) + 1).
+ */
+struct block_info {
+    u32        crc;      /* crc32 of the data block                      */
+    u32        unused;   /* set to debugging marker values by the code   */
+};
+
+/*
+ * State carried through one asynchronous vdi_read/vdi_write.  It is
+ * malloc()ed by vdi_read/vdi_write and passed as the callback parameter
+ * of every block operation issued on behalf of the request.
+ */
+struct io_req {
+    enum { IO_OP_READ, IO_OP_WRITE } op;
+    u64        root;     /* radix root of the vdi                        */
+    u64        vaddr;    /* virtual block address, already shifted left  */
+    int        state;    /* current enum states value                    */
+    io_cb_t    cb;       /* caller's completion callback                 */
+    void      *param;    /* opaque argument passed back to cb            */
+    struct radix_lock *lock; /* the vdi's radix lock                     */
+
+    /* internal stuff: */
+    struct io_ret     retval;/* holds the return while we unlock. */
+    char             *block; /* the block to write */
+    radix_tree_node   radix[3];      /* radix nodes read so far, by level */
+    u64               radix_addr[3]; /* block addresses of those nodes    */
+    struct block_info bi;            /* crc info for the data block       */
+};
+
+/* Mask every entry of a radix node with ONEMASK. */
+void clear_w_bits(radix_tree_node node)
+{
+    int slot = 0;
+
+    while (slot < RADIX_TREE_MAP_ENTRIES) {
+        node[slot] &= ONEMASK;
+        slot++;
+    }
+}
+
+/*
+ * Mask every even-indexed entry of a radix node with ONEMASK; the
+ * odd-indexed entries are left untouched.
+ */
+void clear_L3_w_bits(radix_tree_node node)
+{
+    int slot = 0;
+
+    while (slot < RADIX_TREE_MAP_ENTRIES) {
+        node[slot] &= ONEMASK;
+        slot += 2;
+    }
+}
+
+/*
+ * States of the asynchronous request state machines.  The current state
+ * is kept in io_req.state and dispatched on by read_cb and write_cb each
+ * time a block operation completes.  In the write-path names a trailing
+ * "z" marks a zero (empty subtree) path and a trailing "f" a fault path,
+ * matching the section comments below.
+ */
+enum states {
+    /* both */
+    READ_L1,
+    READ_L2,
+    READ_L3,
+
+    /* read */
+    READ_LOCKED,
+    READ_DATA,
+    READ_UNLOCKED,
+    RETURN_ZERO,
+
+    /* write */
+    WRITE_LOCKED,
+    WRITE_DATA,
+    WRITE_L3,
+    WRITE_UNLOCKED,
+    
+    /* L3 Zero Path */
+    ALLOC_DATA_L3z,
+    WRITE_L3_L3z,
+    
+    /* L3 Fault Path */
+    ALLOC_DATA_L3f,
+    WRITE_L3_L3f,
+    
+    /* L2 Zero Path */
+    ALLOC_DATA_L2z,
+    WRITE_L2_L2z,
+    ALLOC_L3_L2z,
+    WRITE_L2_L3z,
+    
+    /* L2 Fault Path */
+    READ_L3_L2f,
+    ALLOC_DATA_L2f,
+    WRITE_L2_L2f,
+    ALLOC_L3_L2f,
+    WRITE_L2_L3f,
+
+    /* L1 Zero Path */
+    ALLOC_DATA_L1z,
+    ALLOC_L3_L1z,
+    ALLOC_L2_L1z,
+    WRITE_L1_L1z,
+
+    /* L1 Fault Path */
+    READ_L2_L1f,
+    READ_L3_L1f,
+    ALLOC_DATA_L1f,
+    ALLOC_L3_L1f,
+    ALLOC_L2_L1f,
+    WRITE_L1_L1f,
+    
+};
+
+/* Indices into io_req.radix[] and io_req.radix_addr[], one per level. */
+enum radix_offsets {
+    L1 = 0, 
+    L2 = 1,
+    L3 = 2
+};
+
+
+static void read_cb(struct io_ret ret, void *param);
+static void write_cb(struct io_ret ret, void *param);
+
+/*
+ * Start an asynchronous read of one virtual block of a VDI.
+ *
+ * cb is called with param once the read has completed or failed.
+ * Returns 0 once the request has been queued, ERR_BAD_VADDR for an
+ * invalid address, or ERR_NOMEM if the request cannot be allocated; in
+ * the error cases nothing is queued.
+ */
+int vdi_read(vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param)
+{
+    struct io_req *io;
+    int level;
+
+    if (!VALID_VADDR(vaddr))
+        return ERR_BAD_VADDR;
+
+    /* Every second line in the bottom-level radix tree is used to     */
+    /* store crc32 values etc. We shift the vaddr here to achieve this. */
+    vaddr = vaddr << 1;
+
+    io = (struct io_req *)malloc(sizeof (struct io_req));
+    if (io == NULL)
+        return ERR_NOMEM;
+
+    for (level = 0; level < 3; level++)
+        io->radix[level] = NULL;
+
+    io->op    = IO_OP_READ;
+    io->state = READ_LOCKED;
+    io->vaddr = vaddr;
+    io->root  = vdi->radix_root;
+    io->lock  = vdi->radix_lock;
+    io->cb    = cb;
+    io->param = param;
+
+    /* The state machine in read_cb starts once the lock call reports back. */
+    block_rlock(io->lock, L1_IDX(vaddr), read_cb, io);
+
+    return 0;
+}
+
+
+/*
+ * Start an asynchronous write of one block to a virtual block of a VDI.
+ *
+ * cb is called with param once the write has completed or failed.
+ * Returns 0 once the request has been queued, ERR_BAD_VADDR for an
+ * invalid address, or ERR_NOMEM if the request cannot be allocated; in
+ * the error cases nothing is queued.
+ */
+int   vdi_write(vdi_t *vdi, u64 vaddr, char *block,
+                io_cb_t cb, void *param)
+{
+    struct io_req *io;
+    u32 seed;
+    int level;
+
+    if (!VALID_VADDR(vaddr))
+        return ERR_BAD_VADDR;
+
+    /* Every second line in the bottom-level radix tree is used to     */
+    /* store crc32 values etc. We shift the vaddr here to achieve this. */
+    vaddr = vaddr << 1;
+
+    io = (struct io_req *)malloc(sizeof (struct io_req));
+    if (io == NULL)
+        return ERR_NOMEM;
+
+    for (level = 0; level < 3; level++)
+        io->radix[level] = NULL;
+
+    io->op    = IO_OP_WRITE;
+    io->root  = vdi->radix_root;
+    io->lock  = vdi->radix_lock;
+    io->vaddr = vaddr;
+    io->block = block;
+
+    /* Todo: add a pseudoheader to the block to include some location  */
+    /* information in the CRC as well.                                  */
+    seed = (u32) crc32(0L, Z_NULL, 0);
+    io->bi.crc    = (u32) crc32(seed, block, BLOCK_SIZE);
+    io->bi.unused = 0xdeadbeef;
+
+    io->cb    = cb;
+    io->param = param;
+    io->radix_addr[L1] = getid(io->root); /* for consistency */
+    io->state = WRITE_LOCKED;
+
+    /* The state machine in write_cb starts once the lock call reports back. */
+    block_wlock(io->lock, L1_IDX(vaddr), write_cb, io);
+
+    return 0;
+}
+
+/*
+ * read_cb: state machine driving an asynchronous vdi_read.
+ *
+ * It is the completion callback for every step of a read: taking the
+ * read lock, reading the L1, L2 and L3 radix nodes, reading the data
+ * block, and releasing the lock.  req->state names the step that has just
+ * completed and ret carries that step's result.
+ *
+ * Outcomes delivered to the caller's callback (req->cb):
+ *   - the data block, once it has been read and the lock released;
+ *   - a block from newblock(), when a radix entry on the path is ZERO;
+ *   - the failing io_ret, when a block read returns no block.
+ * In all three cases req is freed before the callback runs.
+ */
+static void read_cb(struct io_ret ret, void *param)
+{
+    struct io_req *req = (struct io_req *)param;
+    radix_tree_node node;
+    u64 idx;
+    char *block;
+    void *req_param;
+
+    DPRINTF("read_cb\n");
+    /* get record */
+    switch(req->state) {
+       
+    case READ_LOCKED: 
+    
+        /* Lock step done: fetch the radix root. */
+        DPRINTF("READ_LOCKED\n");
+       req->state = READ_L1;
+       block_read(getid(req->root), read_cb, req); 
+       break;
+       
+    case READ_L1: /* block is the radix root */
+
+        DPRINTF("READ_L1\n");
+        block = IO_BLOCK(ret);
+        if (block == NULL) goto fail;
+        node = (radix_tree_node) block;
+        idx  = getid( node[L1_IDX(req->vaddr)] );
+        /* NOTE(review): blocks are released with freeblock() elsewhere */
+        /* in this changeset -- confirm plain free() is equivalent.     */
+        free(block);
+        if ( idx == ZERO ) {
+            /* Nothing mapped below this entry: unlock, return zeros. */
+            req->state = RETURN_ZERO;
+            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+        } else {
+            req->state = READ_L2;
+            block_read(idx, read_cb, req);
+        }
+        break;
+
+    case READ_L2:
+
+        DPRINTF("READ_L2\n");
+        block = IO_BLOCK(ret);
+        if (block == NULL) goto fail;
+        node = (radix_tree_node) block;
+        idx  = getid( node[L2_IDX(req->vaddr)] );
+        free(block);
+        if ( idx == ZERO ) {
+            req->state = RETURN_ZERO;
+            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+        } else {
+            req->state = READ_L3;
+            block_read(idx, read_cb, req);
+        }
+        break;
+
+    case READ_L3:
+    {
+        struct block_info *bi;
+
+        DPRINTF("READ_L3\n");
+        block = IO_BLOCK(ret);
+        if (block == NULL) goto fail;
+        node = (radix_tree_node) block;
+        idx  = getid( node[L3_IDX(req->vaddr)] );
+        /* The slot after the address holds the block's crc info; keep */
+        /* a copy for the check in READ_DATA.                          */
+        bi = (struct block_info *) &node[L3_IDX(req->vaddr) + 1];
+        req->bi = *bi;
+        free(block);
+        if ( idx == ZERO )  {
+            req->state = RETURN_ZERO;
+            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+        } else {
+            req->state = READ_DATA;
+            block_read(idx, read_cb, req);
+        }
+        break;
+    }
+    case READ_DATA:
+    {
+        u32 crc;
+
+        DPRINTF("READ_DATA\n");
+        block = IO_BLOCK(ret);
+        if (block == NULL) goto fail;
+
+        /* crc check */
+        crc = (u32) crc32(0L, Z_NULL, 0); 
+        crc = (u32) crc32(crc, block, BLOCK_SIZE); 
+        if (crc != req->bi.crc) {
+            /* TODO: add a retry loop here.                          */
+            /* Do this after the cache is added -- make sure to      */
+            /* invalidate the bad page before reissuing the read.    */
+
+            warn("Bad CRC on vaddr (%Lu:%d)\n", req->vaddr, req->bi.unused);
+#ifdef PRINT_BADCRC_PAGES
+            {
+                int j;
+                for (j=0; j<BLOCK_SIZE; j++) {
+                    if isprint(block[j]) {
+                        printf("%c", block[j]);
+                    } else {
+                        printf(".");
+                    }
+                    if ((j % 64) == 0) printf("\n");
+                }
+            }
+#endif /* PRINT_BADCRC_PAGES */
+
+            /* fast and loose for the moment. */
+            /* goto fail;                     */
+        }
+
+        /* A crc mismatch is only warned about: the data is returned. */
+        req->retval = ret;
+        req->state = READ_UNLOCKED;
+        block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+        break;
+    }
+    case READ_UNLOCKED:
+    {
+        struct io_ret r;
+        io_cb_t cb;
+        DPRINTF("READ_UNLOCKED\n");
+        /* Copy what the callback needs out of req before freeing it. */
+        req_param = req->param;
+        r         = req->retval;
+        cb        = req->cb;
+        free(req);
+        cb(r, req_param);
+        break;
+    }
+    
+    case RETURN_ZERO:
+    {
+        struct io_ret r;
+        io_cb_t cb;
+        DPRINTF("RETURN_ZERO\n");
+        req_param = req->param;
+        cb        = req->cb;
+        free(req);
+        /* NOTE(review): newblock() may return NULL (vdi.c checks for  */
+        /* it); the caller's callback would then see a NULL block.     */
+        r.type = IO_BLOCK_T;
+        r.u.b = newblock();
+        cb(r, req_param);
+        break;
+    }
+        
+    default:
+       /* NOTE(review): the message says "Write" but this is the read path. */
+       DPRINTF("*** Write: Bad state! (%d) ***\n", req->state);
+       goto fail;
+    }
+ 
+    return;
+
+ fail:
+    /* NOTE(review): this path never calls block_runlock(), so the read */
+    /* lock taken in vdi_read appears to stay held -- confirm.          */
+    {
+        struct io_ret r;
+        io_cb_t cb;
+        DPRINTF("asyn_read had a read error.\n");
+        req_param = req->param;
+        r         = ret;
+        cb        = req->cb;
+        free(req);
+        cb(r, req_param);
+    }
+
+
+}
+
+static void write_cb(struct io_ret r, void *param)
+{
+    struct io_req *req = (struct io_req *)param;
+    radix_tree_node node;
+    u64 a, addr;
+    void *req_param;
+    struct block_info *bi;
+
+    switch(req->state) {
+       
+    case WRITE_LOCKED:
+        
+        DPRINTF("WRITE_LOCKED (%llu)\n", L1_IDX(req->vaddr));
+       req->state = READ_L1;
+       block_read(getid(req->root), write_cb, req); 
+       break;
+       
+    case READ_L1: /* block is the radix root */
+
+        DPRINTF("READ_L1\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        if (node == NULL) goto fail;
+        a    = node[L1_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix_addr[L2] = addr;
+        req->radix[L1] = node;
+
+        if ( addr == ZERO ) {
+            /* L1 empty subtree: */
+            req->state = ALLOC_DATA_L1z;
+            block_alloc( req->block, write_cb, req );
+        } else if ( !iswritable(a) ) {
+            /* L1 fault: */
+            req->state = READ_L2_L1f;
+            block_read( addr, write_cb, req );
+        } else {
+            req->state = READ_L2;
+            block_read( addr, write_cb, req );
+        }
+        break;
+    
+    case READ_L2:
+
+        DPRINTF("READ_L2\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        if (node == NULL) goto fail;
+        a    = node[L2_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix_addr[L3] = addr;
+        req->radix[L2] = node;
+
+        if ( addr == ZERO ) {
+            /* L2 empty subtree: */
+            req->state = ALLOC_DATA_L2z;
+            block_alloc( req->block, write_cb, req );
+        } else if ( !iswritable(a) ) {
+            /* L2 fault: */
+            req->state = READ_L3_L2f;
+            block_read( addr, write_cb, req );
+        } else {
+            req->state = READ_L3;
+            block_read( addr, write_cb, req );
+        }
+        break;
+    
+    case READ_L3:
+
+        DPRINTF("READ_L3\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        if (node == NULL) goto fail;
+        a    = node[L3_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix[L3] = node;
+
+        if ( addr == ZERO ) {
+            /* L3 fault: */
+            req->state = ALLOC_DATA_L3z;
+            block_alloc( req->block, write_cb, req );
+        } else if ( !iswritable(a) ) {
+            /* L3 fault: */
+            req->state = ALLOC_DATA_L3f;
+            block_alloc( req->block, write_cb, req );
+        } else {
+            req->state = WRITE_DATA;
+            block_write( addr, req->block, write_cb, req );
+        }
+        break;
+    
+    case WRITE_DATA:
+
+        DPRINTF("WRITE_DATA\n");
+        /* The L3 radix points to the correct block, we just need to  */
+        /* update the crc.                                            */
+        if (IO_INT(r) < 0) goto fail;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 101;
+        *bi = req->bi;
+        req->state = WRITE_L3;
+        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
+        break;
+    
+    /* L3 Zero Path: */
+
+    case ALLOC_DATA_L3z:
+
+        DPRINTF("ALLOC_DATA_L3z\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L3][L3_IDX(req->vaddr)] = a;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 102;
+        *bi = req->bi;
+        req->state = WRITE_L3_L3z;
+        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
+        break;
+    
+    /* L3 Fault Path: */
+
+    case ALLOC_DATA_L3f:
+    
+        DPRINTF("ALLOC_DATA_L3f\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L3][L3_IDX(req->vaddr)] = a;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 103;
+        *bi = req->bi;
+        req->state = WRITE_L3_L3f;
+        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
+        break;
+
+    /* L2 Zero Path: */
+        
+    case ALLOC_DATA_L2z:
+
+        DPRINTF("ALLOC_DATA_L2z\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L3] = newblock();
+        req->radix[L3][L3_IDX(req->vaddr)] = a;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 104;
+        *bi = req->bi;
+        req->state = ALLOC_L3_L2z;
+        block_alloc( (char*)req->radix[L3], write_cb, req );
+        break;
+
+    case ALLOC_L3_L2z:
+
+        DPRINTF("ALLOC_L3_L2z\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L2][L2_IDX(req->vaddr)] = a;
+        req->state = WRITE_L2_L2z;
+        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
+        break;
+        
+    /* L2 Fault Path: */
+        
+    case READ_L3_L2f:
+    
+       DPRINTF("READ_L3_L2f\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        clear_L3_w_bits(node);
+        if (node == NULL) goto fail;
+        a    = node[L2_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix[L3] = node;
+        req->state = ALLOC_DATA_L2f;
+        block_alloc( req->block, write_cb, req );
+        break;
+                
+    case ALLOC_DATA_L2f:
+
+        DPRINTF("ALLOC_DATA_L2f\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L3][L3_IDX(req->vaddr)] = a;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 105;
+        *bi = req->bi;
+        req->state = ALLOC_L3_L2f;
+        block_alloc( (char*)req->radix[L3], write_cb, req );
+        break;
+
+    case ALLOC_L3_L2f:
+
+        DPRINTF("ALLOC_L3_L2f\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L2][L2_IDX(req->vaddr)] = a;
+        req->state = WRITE_L2_L2f;
+        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
+        break;
+        
+    /* L1 Zero Path: */
+    
+    case ALLOC_DATA_L1z:
+
+        DPRINTF("ALLOC_DATA_L1z\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L3] = newblock();
+        req->radix[L3][L3_IDX(req->vaddr)] = a;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 106;
+        *bi = req->bi;
+        req->state = ALLOC_L3_L1z;
+        block_alloc( (char*)req->radix[L3], write_cb, req );
+        break;
+        
+    case ALLOC_L3_L1z:
+
+        DPRINTF("ALLOC_L3_L1z\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L2] = newblock();
+        req->radix[L2][L2_IDX(req->vaddr)] = a;
+        req->state = ALLOC_L2_L1z;
+        block_alloc( (char*)req->radix[L2], write_cb, req );
+        break;
+
+    case ALLOC_L2_L1z:
+
+        DPRINTF("ALLOC_L2_L1z\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L1][L1_IDX(req->vaddr)] = a;
+        req->state = WRITE_L1_L1z;
+        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
+        break;
+
+    /* L1 Fault Path: */
+        
+    case READ_L2_L1f:
+    
+       DPRINTF("READ_L2_L1f\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        clear_w_bits(node);
+        if (node == NULL) goto fail;
+        a    = node[L2_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix_addr[L3] = addr;
+        req->radix[L2] = node;
+        
+        if (addr == ZERO) {
+            /* nothing below L2, create an empty L3 and alloc data. */
+            /* (So skip READ_L3_L1f.) */
+            req->radix[L3] = newblock();
+            req->state = ALLOC_DATA_L1f;
+            block_alloc( req->block, write_cb, req );
+        } else {
+            req->state = READ_L3_L1f;
+            block_read( addr, write_cb, req );
+        }
+        break;
+        
+    case READ_L3_L1f:
+    
+       DPRINTF("READ_L3_L1f\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        clear_L3_w_bits(node);
+        if (node == NULL) goto fail;
+        a    = node[L2_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix[L3] = node;
+        req->state = ALLOC_DATA_L1f;
+        block_alloc( req->block, write_cb, req );
+        break;
+                
+    case ALLOC_DATA_L1f:
+
+        DPRINTF("ALLOC_DATA_L1f\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L3][L3_IDX(req->vaddr)] = a;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 107;
+        *bi = req->bi;
+        req->state = ALLOC_L3_L1f;
+        block_alloc( (char*)req->radix[L3], write_cb, req );
+        break;
+
+    case ALLOC_L3_L1f:
+
+        DPRINTF("ALLOC_L3_L1f\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L2][L2_IDX(req->vaddr)] = a;
+        req->state = ALLOC_L2_L1f;
+        block_alloc( (char*)req->radix[L2], write_cb, req );
+        break;
+
+    case ALLOC_L2_L1f:
+
+        DPRINTF("ALLOC_L2_L1f\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L1][L1_IDX(req->vaddr)] = a;
+        req->state = WRITE_L1_L1f;
+        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
+        break;
+
+    case WRITE_L3:
+    case WRITE_L3_L3z:
+    case WRITE_L3_L3f:
+    case WRITE_L2_L2z:
+    case WRITE_L2_L2f:
+    case WRITE_L1_L1z:
+    case WRITE_L1_L1f:
+    {
+       int i;
+        DPRINTF("DONE\n");
+        /* free any saved node vals. */
+        for (i=0; i<3; i++)
+            if (req->radix[i] != 0) free(req->radix[i]);
+        req->retval = r;
+        req->state = WRITE_UNLOCKED;
+        block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
+        break;
+    }
+    case WRITE_UNLOCKED:
+    {
+        struct io_ret r;
+        io_cb_t cb;
+        DPRINTF("WRITE_UNLOCKED!\n");
+        req_param = req->param;
+        r         = req->retval;
+        cb        = req->cb;
+        free(req);
+        cb(r, req_param);
+        break;
+    }
+        
+    default:
+       DPRINTF("*** Write: Bad state! (%d) ***\n", req->state);
+       goto fail;
+    }
+    
+    return;
+    
+ fail:
+    {
+        struct io_ret r;
+        io_cb_t cb;
+        int i;
+
+        DPRINTF("asyn_write had a read error mid-way.\n");
+        req_param = req->param;
+        cb        = req->cb;
+        r.type = IO_INT_T;
+        r.u.i  = -1;
+        /* free any saved node vals. */
+        for (i=0; i<3; i++)
+            if (req->radix[i] != 0) free(req->radix[i]);
+        free(req);
+        cb(r, req_param);
+    }
+}
+
+char *vdi_read_s(vdi_t *vdi, u64 vaddr)
+{   /* Blocking wrapper around vdi_read(): returns the block handed to the callback. */
+    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
+    char *block = NULL;
+    int ret;
+    /* Completion callback (GNU C nested function): save the block, release the waiter. */
+    void reads_cb(struct io_ret r, void *param) 
+    {
+        block = IO_BLOCK(r);
+        pthread_mutex_unlock((pthread_mutex_t *)param);
+    }
+    /* m is taken once here so that the second lock below blocks until reads_cb unlocks it. */
+    pthread_mutex_lock(&m);
+    ret = vdi_read(vdi, vaddr, reads_cb, &m);
+    /* NOTE(review): if reads_cb runs on another thread it unlocks a mutex it does not own -- confirm. */
+    if (ret == 0) pthread_mutex_lock(&m);
+    /* block keeps its initial NULL unless reads_cb stored something. */
+    return block;
+}
+
+
+int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block)
+{   /* Blocking wrapper around vdi_write(): returns the callback's IO_INT status, or vdi_write()'s error. */
+    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
+    int ret, result = -1; /* never hand back an indeterminate value */
+    /* Completion callback (GNU C nested function): save the status, release the waiter. */
+    void writes_cb(struct io_ret r, void *param) 
+    {
+        result = IO_INT(r);
+        pthread_mutex_unlock((pthread_mutex_t *)param);
+    }
+    /* m is taken once here so that the second lock below blocks until writes_cb unlocks it. */
+    pthread_mutex_lock(&m);
+    ret = vdi_write(vdi, vaddr, block, writes_cb, &m);
+    /* Wait only if the request was accepted; otherwise report the submission error itself. */
+    if (ret == 0) pthread_mutex_lock(&m);
+    else          result = ret;
+    return result;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/requests-async.h
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/requests-async.h    Sun Jul  3 22:36:48 2005
@@ -0,0 +1,29 @@
+#ifndef _REQUESTSASYNC_H_
+#define _REQUESTSASYNC_H_
+
+#include "block-async.h"
+#include "blockstore.h" /* for newblock etc. */
+
+/*
+#define BLOCK_SIZE 4096
+#define ZERO 0ULL
+#define getid(x) (((x)>>1)&0x7fffffffffffffffLLU)
+#define iswritable(x) (((x) & 1LLU) != 0)
+#define writable(x) (((x) << 1) | 1LLU)
+#define readonly(x) ((u64)((x) << 1))
+*/
+
+#define VADDR_MASK 0x0000000003ffffffLLU /* 26-bits = 256Gig */
+#define VALID_VADDR(x) (((x) & VADDR_MASK) == (x)) /* nonzero iff x has no bits outside VADDR_MASK; evaluates x twice */
+
+int vdi_read (vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param);
+int vdi_write(vdi_t *vdi, u64 vaddr, char *block, io_cb_t cb, void *param);
+             
+/* synchronous versions: */
+char *vdi_read_s (vdi_t *vdi, u64 vaddr);
+int   vdi_write_s(vdi_t *vdi, u64 vaddr, char *block);
+
+#define ERR_BAD_VADDR  -1
+#define ERR_NOMEM      -2
+
+#endif //_REQUESTSASYNC_H_
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_unittest.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_unittest.c      Sun Jul  3 22:36:48 2005
@@ -0,0 +1,184 @@
+/**************************************************************************
+ * 
+ * vdi_unittest.c
+ *
+ * Run a small test workload to ensure that data access through a vdi
+ * is (at least superficially) correct.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "requests-async.h"
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+#define TEST_PAGES  32
+static char *zero_page;
+static char pages[TEST_PAGES][BLOCK_SIZE];
+static int next_page = 0;
+
+void fill_test_pages(void)
+{   /* Fill every pages[] entry with random() data and set zero_page from newblock(). */
+    int i, j;
+    long *page;
+    /* Bound by sizeof(long): the old BLOCK_SIZE/4 count overran each page where long is 8 bytes. */
+    for (i=0; i< TEST_PAGES; i++) {
+        page = (long *)pages[i];
+        for (j=0; j<(int)(BLOCK_SIZE/sizeof(*page)); j++) {
+            page[j] = random();
+        }
+    }
+    /* coverage_test() compares unmapped reads against this block (presumably zero-filled -- confirm). */
+    zero_page = newblock();
+}
+
+inline u64 make_vaddr(u64 L1, u64 L2, u64 L3)
+{
+    /* Pack the three radix indices into one virtual block address:
+     * L1 lands at bit 18 and up, L2 at bits 9-17 and L3 at bits 0-8.
+     * L2 and L3 are OR-ed in unmasked, so each is expected to fit in 9 bits.
+     */
+    u64 upper = (L1 << 9) | L2;
+    return (upper << 9) | L3;
+}
+
+void touch_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3)
+{   /* Write the next unused test page at (L1,L2,L3), read it back and compare; problems are printed. */
+    u64 vaddr;
+    char *page, *rpage = NULL;
+
+    if (next_page >= TEST_PAGES) { printf("out of test pages\n"); return; }
+    page = pages[next_page++];
+    printf("TOUCH (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
+
+    vaddr = make_vaddr(L1, L2, L3);
+    if (vdi_write_s(vdi, vaddr, page) < 0)
+        printf( "write %Lu failed\n", vaddr);
+    rpage = vdi_read_s(vdi, vaddr);
+
+    if (rpage == NULL) 
+    {
+        printf( "read %Lu returned NULL\n", vaddr); 
+        return; 
+    }
+
+    /* Report a mismatch but fall through, so rpage is released on both outcomes. */
+    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
+        printf( "read %Lu returned a different page\n", vaddr);
+
+    freeblock(rpage);
+}
+
+void test_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3, char *page)
+{
+    /* Read the block at (L1,L2,L3) and compare it with the expected page.
+     * Problems are only printed; nothing is returned to the caller. */
+    u64 vaddr;
+    char *rpage = NULL;
+
+    printf("TEST  (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
+
+    vaddr = make_vaddr(L1, L2, L3);
+    rpage = vdi_read_s(vdi, vaddr);
+
+    if (rpage == NULL) 
+    {
+        printf( "read %Lu returned NULL\n", vaddr); 
+        return; 
+    }
+
+    /* Do not return early on a mismatch: rpage still has to be freed. */
+    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
+        printf( "read %Lu returned a different page\n", vaddr);
+
+    freeblock(rpage);
+}
+
+void coverage_test(vdi_t *vdi)
+{
+    /* i, j, k remember which pages[] entry was written to a block before the snapshot. */
+    int i, j, k;
+
+    /* Do a series of writes and reads to test all paths through the 
+     * async radix code.  The radix request code will dump CRC warnings
+     * if there are data problems here as well.
+     */
+
+    /* L1 Zero */
+    touch_block(vdi, 0, 0, 0);
+
+    /* L2 Zero */
+    i = next_page;
+    touch_block(vdi, 0, 1, 0);
+
+    /* L3 Zero */
+    j = next_page;
+    touch_block(vdi, 0, 0, 1);
+    k = next_page;
+    touch_block(vdi, 0, 1, 1);
+
+    /* Direct write */
+    touch_block(vdi, 0, 0, 0);
+    /* Everything below runs against the VDI after it has been snapshotted. */
+    vdi_snapshot(vdi);
+
+    /* L1 fault */
+    touch_block(vdi, 0, 0, 0);
+    /* test the read-only branches that should have been copied over. */
+    test_block(vdi, 0, 1, 0, pages[i]);
+    test_block(vdi, 0, 0, 1, pages[j]);
+
+    /* L2 fault */
+    touch_block(vdi, 0, 1, 0);
+    test_block(vdi, 0, 1, 1, pages[k]);
+
+    /* L3 fault */
+    touch_block(vdi, 0, 0, 1);
+    
+    /* read - L1 zero */
+    test_block(vdi, 1, 0, 0, zero_page);
+    
+    /* read - L2 zero */
+    test_block(vdi, 0, 2, 0, zero_page);
+
+    /* read - L3 zero */
+    test_block(vdi, 0, 0, 2, zero_page);
+}
+
+int main(int argc, char *argv[])
+{
+    /*
+     * Unit-test driver.  Calls __init_blockstore(), init_block_async() and
+     * __init_vdi(), creates a VDI named "UNIT TEST VDI" with no parent
+     * snapshot, fills the test pages and runs coverage_test() on it.
+     * Command-line arguments are not used.  Returns 0 after the run and
+     * exits with -1 if the VDI cannot be created.
+     */
+    vdi_t       *vdi;
+    
+    __init_blockstore();
+    init_block_async();
+    __init_vdi();
+        
+    vdi = vdi_create( NULL, "UNIT TEST VDI");
+    
+    if ( vdi == NULL ) {
+        /* creation failed, so there is no vdi to release */
+        printf("Failed to create VDI!\n");
+        exit(-1);
+    }
+
+    fill_test_pages();
+    coverage_test(vdi);
+    
+    freeblock(vdi);
+    
+    return (0);
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/block-async.h
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/block-async.h       Sun Jul  3 22:36:48 2005
@@ -0,0 +1,69 @@
+/* block-async.h
+ * 
+ * Asynchronous block wrappers for parallax.
+ */
+ 
+#ifndef _BLOCKASYNC_H_
+#define _BLOCKASYNC_H_
+
+#include <assert.h>
+#include <xc.h>
+#include "vdi.h"
+
+struct io_ret
+{
+    enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type; /* tag: which member of u is valid */
+    union {
+        u64   a; /* returned by IO_ADDR(),  which asserts type == IO_ADDR_T  */
+        char *b; /* returned by IO_BLOCK(), which asserts type == IO_BLOCK_T */
+        int   i; /* returned by IO_INT(),   which asserts type == IO_INT_T   */
+    } u;
+};
+
+typedef void (*io_cb_t)(struct io_ret r, void *param);
+
+/* per-vdi lock structures to make sure requests run in a safe order. */
+struct radix_wait {
+    enum {RLOCK, WLOCK} type;   /* presumably the kind of lock this waiter asked for -- confirm */
+    io_cb_t  cb;                /* callback stored with the waiter; presumably run once the lock is granted -- confirm */
+    void    *param;             /* opaque argument stored alongside cb */
+    struct radix_wait *next;    /* link to another radix_wait (singly linked) */
+};
+
+struct radix_lock {
+    pthread_mutex_t lock;                  /* presumably guards the three arrays below -- confirm */
+    int                    lines[1024];    /* 1024 slots, presumably indexed by the row argument of block_*lock() -- confirm */
+    struct radix_wait     *waiters[1024];  /* one radix_wait list pointer per slot */
+    enum {ANY, READ, STOP} state[1024];    /* one state per slot; meaning is defined by the block_*lock code, not visible here */
+};
+void radix_lock_init(struct radix_lock *r);
+
+void block_read(u64 addr, io_cb_t cb, void *param);
+void block_write(u64 addr, char *block, io_cb_t cb, void *param);
+void block_alloc(char *block, io_cb_t cb, void *param);
+void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void init_block_async(void);
+
+static inline u64 IO_ADDR(struct io_ret r)
+{   /* Unwrap the u64 payload; asserts that r is tagged IO_ADDR_T. */
+    assert(IO_ADDR_T == r.type);
+    return (r.u.a);
+}
+
+static inline char *IO_BLOCK(struct io_ret r)
+{   /* Unwrap the block pointer payload; asserts that r is tagged IO_BLOCK_T. */
+    assert(IO_BLOCK_T == r.type);
+    return (r.u.b);
+}
+
+static inline int IO_INT(struct io_ret r)
+{   /* Unwrap the int payload; asserts that r is tagged IO_INT_T. */
+    assert(IO_INT_T == r.type);
+    return (r.u.i);
+}
+
+
+#endif //_BLOCKASYNC_H_
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_snap.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_snap.c  Sun Jul  3 22:36:48 2005
@@ -0,0 +1,43 @@
+/**************************************************************************
+ * 
+ * vdi_snap.c
+ *
+ * Snapshot a vdi.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+int main(int argc, char *argv[])
+{
+    vdi_t  *vdi;
+    u64     id;
+    /* Usage: <prog> <VDI id>.  Looks the VDI up with vdi_get() and calls vdi_snapshot() on it. */
+    __init_blockstore();
+    __init_vdi();
+    
+    if ( argc == 1 ) {
+        printf("usage: %s <VDI id>\n", argv[0]);
+        exit(-1);
+    }
+    
+    id = (u64) atoll(argv[1]);
+    
+    vdi = vdi_get(id);
+    
+    if ( vdi == NULL ) {
+        /* lookup failed, so there is no vdi to release */
+        printf("couldn't find the requested VDI.\n");
+        exit(-1);
+    }
+    
+    vdi_snapshot(vdi);
+    freeblock(vdi);   /* release the vdi_get() result, as vdi_fill.c and vdi_validate.c do */
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_create.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_create.c        Sun Jul  3 22:36:48 2005
@@ -0,0 +1,52 @@
+/**************************************************************************
+ * 
+ * vdi_create.c
+ *
+ * Create a new vdi.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+int main(int argc, char *argv[])
+{
+    vdi_t       *vdi;
+    char         name[VDI_NAME_SZ] = "";
+    snap_id_t    id;
+    int          from_snap = 0;
+    /* Usage: <prog> <VDI Name> [<snap block> <snap idx>]; exits with -1 on failure. */
+    __init_blockstore();
+    __init_vdi();
+    
+    if ( argc == 1 ) {
+        printf("usage: %s <VDI Name> [<snap block> <snap idx>]\n", argv[0]);
+        exit(-1);
+    }
+    /* Truncate over-long names; the terminator must stay inside name[]. */
+    strncpy( name, argv[1], VDI_NAME_SZ - 1);
+    name[VDI_NAME_SZ - 1] = '\0';   /* name[VDI_NAME_SZ] was one past the end */
+    
+    if ( argc > 3 ) {
+        id.block   = (u64)          atoll(argv[2]);
+        id.index   = (unsigned int) atol (argv[3]);
+        from_snap  = 1;
+    }
+    /* A snapshot id is passed only when both optional arguments were given. */
+    vdi = vdi_create( from_snap ? &id : NULL, name);
+    
+    if ( vdi == NULL ) {
+        /* creation failed, so there is no vdi to release */
+        printf("Failed to create VDI!\n");
+        exit(-1);
+    }
+    
+    freeblock(vdi);
+    
+    return (0);
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_validate.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_validate.c      Sun Jul  3 22:36:48 2005
@@ -0,0 +1,97 @@
+/**************************************************************************
+ * 
+ * vdi_validate.c
+ *
+ * Intended to sanity-check vdi_fill and the underlying vdi code.
+ *
+ * Block-by-block compare of a vdi with a file/device on the disk.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+#include "requests-async.h"
+
+int main(int argc, char *argv[])
+{
+    vdi_t       *vdi;
+    u64          id;
+    int          fd;
+    struct stat  st;
+    u64          tot_size;
+    char         spage[BLOCK_SIZE], *dpage;
+    ssize_t      count;   /* signed: read() returns -1 on error */
+    u64          vblock = 0;
+    /* Usage: <prog> <VDI id> <filename>.  Exits with -1 on any error or mismatch. */
+    __init_blockstore();
+    init_block_async();
+    __init_vdi();
+    
+    if ( argc < 3 ) {
+        printf("usage: %s <VDI id> <filename>\n", argv[0]);
+        exit(-1);
+    }
+        
+    id = (u64) atoll(argv[1]);
+    
+    vdi = vdi_get( id );
+    
+    if ( vdi == NULL ) {
+        printf("Failed to retreive VDI %Ld!\n", id);
+        exit(-1);
+    }
+    
+    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+    
+    if (fd < 0) {
+        printf("Couldn't open %s!\n", argv[2]);
+        exit(-1);
+    }
+    
+    if ( fstat(fd, &st) != 0 ) {
+        printf("Couldn't stat %s!\n", argv[2]);
+        exit(-1);
+    }
+    
+    tot_size = (u64) st.st_size;
+    printf("Testing VDI %Ld (%Ld bytes).\n", id, tot_size);
+    
+    printf("           ");
+    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
+        /* File block number vblock is checked against VDI block vblock. */
+        dpage = vdi_read_s(vdi, vblock);
+
+        if (dpage == NULL) {
+            printf("\n\nfound an unmapped VDI block (%Ld)\n", vblock);
+            exit(-1);
+        }
+        /* Compare only the bytes read(): a short read leaves stale data in spage's tail. */
+        if (memcmp(spage, dpage, (size_t) count) != 0) {
+            printf("\n\nblocks don't match! (%Ld)\n", vblock);
+            exit(-1);
+        }
+        
+        freeblock(dpage);
+        
+        vblock++;
+        if ((vblock % 1024) == 0) {
+            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
+            fflush(stdout);
+        }
+    }
+    printf("\n");
+    if (count < 0) { printf("Couldn't read %s!\n", argv[2]); exit(-1); }
+    printf("VDI %Ld looks good!\n", id);
+    
+    freeblock(vdi);
+    
+    return (0);
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_fill.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_fill.c  Sun Jul  3 22:36:48 2005
@@ -0,0 +1,81 @@
+/**************************************************************************
+ * 
+ * vdi_fill.c
+ *
+ * Hoover a file or device into a vdi.
+ * You must first create the vdi with vdi_create.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "requests-async.h"
+#include "vdi.h"
+
+int main(int argc, char *argv[])
+{
+    vdi_t       *vdi;
+    u64          id;
+    int          fd;
+    struct stat  st;
+    u64          tot_size;
+    char         spage[BLOCK_SIZE];
+    ssize_t      count;   /* signed: read() returns -1 on error */
+    u64          vblock = 0;
+    /* Usage: <prog> <VDI id> <filename>.  Copies the file into the VDI one block at a time. */
+    __init_blockstore();
+    init_block_async();
+    __init_vdi();
+    
+    if ( argc < 3 ) {
+        printf("usage: %s <VDI id> <filename>\n", argv[0]);
+        exit(-1);
+    }
+        
+    id = (u64) atoll(argv[1]);
+    
+    vdi = vdi_get( id );
+    
+    if ( vdi == NULL ) {
+        printf("Failed to retreive VDI %Ld!\n", id);
+        exit(-1);
+    }
+    
+    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+    
+    if (fd < 0) {
+        printf("Couldn't open %s!\n", argv[2]);
+        exit(-1);
+    }
+    
+    if ( fstat(fd, &st) != 0 ) {
+        printf("Couldn't stat %s!\n", argv[2]);
+        exit(-1);
+    }
+    
+    tot_size = (u64) st.st_size;
+    printf("Filling VDI %Ld with %Ld bytes.\n", id, tot_size);
+    
+    printf("%011Ld blocks total\n", tot_size / BLOCK_SIZE);    
+    printf("           ");
+    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
+        vdi_write_s(vdi, vblock, spage); /* NOTE(review): result ignored; a short read leaves stale bytes in spage's tail */
+        vblock++;
+        if ((vblock % 512) == 0) {
+            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
+            fflush(stdout);
+        }
+    }
+    printf("\n");
+    if (count < 0) { printf("Couldn't read %s!\n", argv[2]); exit(-1); }
+    freeblock(vdi);
+    
+    return (0);
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/radix.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/radix.c     Sun Jul  3 22:36:48 2005
@@ -0,0 +1,631 @@
+/*
+ * Radix tree for mapping (up to) 63-bit virtual block IDs to
+ * 63-bit global block IDs
+ *
+ * Pointers within the tree set aside the least significant bit to indicate
+ * whether or not the target block is writable from this node.
+ *
+ * The block with ID 0 is assumed to be an empty block of all zeros
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <pthread.h>
+#include "blockstore.h"
+#include "radix.h"
+
+#define RADIX_TREE_MAP_SHIFT 9
+#define RADIX_TREE_MAP_MASK 0x1ff
+#define RADIX_TREE_MAP_ENTRIES 512
+
+/*
+#define DEBUG
+*/
+
+/* Experimental radix cache. */
+
+static  pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER;
+static  int rcache_count = 0;
+#define RCACHE_MAX 1024
+
+typedef struct rcache_st {
+    radix_tree_node  *node;
+    u64               id;
+    struct rcache_st *hash_next;
+    struct rcache_st *cache_next;
+    struct rcache_st *cache_prev;
+} rcache_t;
+
+static rcache_t *rcache_head = NULL;
+static rcache_t *rcache_tail = NULL;
+
+#define RCHASH_SIZE 512ULL
+rcache_t *rcache[RCHASH_SIZE];
+#define RCACHE_HASH(_id) ((_id) & (RCHASH_SIZE - 1))
+
+void __rcache_init(void)
+{
+    /* Start with every hash bucket empty. */
+    rcache_t **bucket = rcache;
+    rcache_t **end    = rcache + RCHASH_SIZE;
+    while (bucket != end) *bucket++ = NULL;
+}
+    
+
+void rcache_write(u64 id, radix_tree_node *node)
+{
+    rcache_t *r, *tmp, **curs;
+    /* Store a private copy of node under id and make it the most recent entry. */
+    pthread_mutex_lock(&rcache_mutex);
+    /* Best effort: if memory runs out the block is simply left uncached. */
+    /* Is it already in the cache? */
+    r = rcache[RCACHE_HASH(id)];
+    
+    for (;;) {
+        if (r == NULL) 
+            break;
+        if (r->id == id) 
+        {
+            memcpy(r->node, node, BLOCK_SIZE);
+            
+            /* bring to front. */
+            if (r != rcache_head) {
+                
+                if (r == rcache_tail) {
+                    if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
+                    rcache_tail->cache_next = NULL;
+                }
+
+                tmp = r->cache_next;
+                if (r->cache_next != NULL) r->cache_next->cache_prev 
+                                                     = r->cache_prev;
+                if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
+
+                r->cache_prev = NULL;
+                r->cache_next = rcache_head;
+                if (rcache_head != NULL) rcache_head->cache_prev = r;
+                rcache_head = r;
+            }
+
+//printf("Update (%Ld)\n", r->id);
+            goto done;
+        }
+        r = r->hash_next;
+    }
+    
+    if ( rcache_count == RCACHE_MAX ) 
+    {
+        /* Remove an entry */
+        
+        r = rcache_tail;
+        if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
+        rcache_tail->cache_next = NULL;
+        freeblock(r->node);
+        
+        curs = &rcache[RCACHE_HASH(r->id)];
+        while ((*curs) != r)
+            curs = &(*curs)->hash_next;
+        *curs = r->hash_next;
+//printf("Evict (%Ld)\n", r->id);
+        
+    } else {
+        r = (rcache_t *)malloc(sizeof(rcache_t));
+        if (r == NULL) goto done;   /* no memory for a new entry: skip caching */
+        rcache_count++;
+    }
+    
+    r->node = newblock();
+    if (r->node == NULL) { free(r); rcache_count--; goto done; } /* r is in neither list here */
+    memcpy(r->node, node, BLOCK_SIZE);
+    r->id = id;
+    r->hash_next = rcache[RCACHE_HASH(id)];
+    rcache[RCACHE_HASH(id)] = r;
+    
+    r->cache_prev = NULL;
+    r->cache_next = rcache_head;
+    if (rcache_head != NULL) rcache_head->cache_prev = r;
+    rcache_head = r;
+    if (rcache_tail == NULL) rcache_tail = r;
+    
+//printf("Added (%Ld, %p)\n", id, r->node);
+done:
+    pthread_mutex_unlock(&rcache_mutex);
+}
+
+radix_tree_node *rcache_read(u64 id)
+{
+    rcache_t *r, *tmp;
+    radix_tree_node *node = NULL;
+    /* Return a newblock() copy of the cached block for id, or NULL on a miss. */
+    pthread_mutex_lock(&rcache_mutex);
+
+    r = rcache[RCACHE_HASH(id)];
+    
+    for (;;) {
+        if (r == NULL) {
+//printf("Miss (%Ld)\n", id);
+            goto done;
+        }
+        if (r->id == id) break;
+        r = r->hash_next;
+    }
+   
+    /* bring to front. */
+    if (r != rcache_head) 
+    {
+        if (r == rcache_tail) {
+            if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
+            rcache_tail->cache_next = NULL;
+        }
+        tmp = r->cache_next;
+        if (r->cache_next != NULL) r->cache_next->cache_prev = r->cache_prev;
+        if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
+
+        r->cache_prev = NULL;
+        r->cache_next = rcache_head;
+        if (rcache_head != NULL) rcache_head->cache_prev = r;
+        rcache_head = r;
+    }
+    /* If no copy buffer can be had, report a miss rather than copying into NULL. */
+    node = newblock();
+    if (node != NULL)
+        memcpy(node, r->node, BLOCK_SIZE);
+//printf("Hit (%Ld, %p)\n", id, r->node);
+done:
+    pthread_mutex_unlock(&rcache_mutex);
+    
+    return(node);
+}
+
+
+void *rc_readblock(u64 id)
+{
+    /*
+     * Read-through wrapper: return rcache_read()'s copy when id is cached;
+     * otherwise fetch the block with readblock() and, if that yielded a
+     * block, pass it to rcache_write() before returning it.
+     */
+    void *blk = (void *)rcache_read(id);
+
+    if (blk == NULL) {
+        blk = readblock(id);
+        if (blk != NULL) rcache_write(id, blk);
+    }
+    return blk;
+}
+
+u64 rc_allocblock(void *block)
+{
+    /*
+     * Allocate via allocblock() and, unless it returned ZERO, pass the
+     * block to rcache_write() under its new id.  Returns that id.
+     */
+    u64 new_id = allocblock(block);
+
+    if (ZERO != new_id) rcache_write(new_id, block);
+    return new_id;
+}
+
+int rc_writeblock(u64 id, void *block)
+{   /* Write block to the store under id; returns writeblock()'s result unchanged. */
+    int ret;
+    ret = writeblock(id, block);
+    /* Cache only what was actually stored: update() treats a result < 0 as failure. */
+    if (ret >= 0) rcache_write(id, block);
+    
+    return(ret);
+}
+
+
+/*
+ * block device interface and other helper functions
+ * with these functions, block id is just a 63-bit number, with
+ * no special consideration for the LSB
+ */
+radix_tree_node cloneblock(radix_tree_node block);
+
+/*
+ * main api
+ * with these functions, the LSB of root always indicates
+ * whether or not the block is writable, including the return
+ * values of update and snapshot
+ */
+u64 lookup(int height, u64 root, u64 key);
+u64 update(int height, u64 root, u64 key, u64 val);
+u64 snapshot(u64 root);
+
+/**
+ * cloneblock: clone an existing block in memory
+ *   @block: the old block
+ *
+ *   @return: new block, with LSB cleared for every entry
+ */
+radix_tree_node cloneblock(radix_tree_node block) {
+    radix_tree_node node = NULL;   /* malloc()ed copy; NULL when block is NULL or malloc fails */
+    int i;
+    if (block == NULL || (node = (radix_tree_node) malloc(BLOCK_SIZE)) == NULL) {
+        if (block != NULL) perror("cloneblock malloc");   /* a NULL source is not a malloc error */
+        return NULL;
+    }
+    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
+        node[i] = block[i] & ONEMASK;   /* copy each entry through ONEMASK */
+    return node;
+}
+
+/**
+ * lookup: find a value given a key
+ *   @height: height in bits of the radix tree
+ *   @root: root node id, with set LSB indicating writable node
+ *   @key: key to lookup
+ *
+ *   @return: value on success, zero on error
+ */
+
+u64 lookup(int height, u64 root, u64 key) {
+    radix_tree_node node;
+    u64 mask = ONE;   /* AND-ed with every entry read on the way down */
+    
+    assert(key >> height == 0);
+
+    /* the root block may be smaller to ensure all leaves are full */
+    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+
+    /* now carve off equal sized chunks at each step */
+    for (;;) {
+        /* each pass reads one node and follows the entry selected by key */
+
+#ifdef DEBUG
+        printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", height, root,
+                (int) ((key >> height) & RADIX_TREE_MAP_MASK),
+                (iswritable(root) ? "" : " (readonly)"));
+#endif
+        
+        if (getid(root) == ZERO)
+            return ZERO;
+
+        /* a node that cannot be read is reported like a missing key */
+        node = (radix_tree_node) rc_readblock(getid(root));
+        if (node == NULL)
+            return ZERO;
+
+        root = node[(key >> height) & RADIX_TREE_MAP_MASK];
+        mask &= root;
+        freeblock(node);
+
+        if (height == 0)
+            return ( root & ONEMASK ) | mask;
+
+        height -= RADIX_TREE_MAP_SHIFT;
+    }
+
+    return ZERO;
+}
+
+/*
+ * update: set a radix tree entry, doing copy-on-write as necessary
+ *   @height: height in bits of the radix tree
+ *   @root: root node id, with set LSB indicating writable node
+ *   @key: key to set
+ *   @val: value to set, s.t. radix(key)=val
+ *
+ *   @returns: (possibly new) root id on success (with LSB=1), 0 on failure
+ */
+
+u64 update(int height, u64 root, u64 key, u64 val) {
+    int offset;
+    u64 child;
+    radix_tree_node node;
+    
+    /* base case--return val */
+    if (height == 0)
+        return val;
+
+    /* the root block may be smaller to ensure all leaves are full */
+    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+    offset = (key >> height) & RADIX_TREE_MAP_MASK;
+
+#ifdef DEBUG
+    printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root,
+            offset, (iswritable(root)?"":" (clone)"));
+#endif
+
+    /* load a block, or create a new one */
+    if (root == ZERO) {
+        node = (radix_tree_node) newblock();
+    } else {
+        node = (radix_tree_node) rc_readblock(getid(root));
+        /* a failed read is caught by the NULL check below; never clone or free NULL */
+        if (node != NULL && !iswritable(root)) {
+            /* need to clone this node */
+            radix_tree_node oldnode = node;
+            node = cloneblock(node);
+            freeblock(oldnode);
+            root = ZERO;
+        }
+    }
+
+    if (node == NULL) {
+#ifdef DEBUG
+        printf("update: node is null!\n");
+#endif
+        return ZERO;
+    }
+
+    child = update(height, node[offset], key, val);
+
+    if (child == ZERO) {
+        freeblock(node);
+        return ZERO;
+    } else if (child == node[offset]) {
+        /* no change, so we already owned the child */
+        assert(iswritable(root));
+
+        freeblock(node);
+        return root;
+    }
+
+    node[offset] = child;
+
+    /* new/cloned blocks need to be saved */
+    if (root == ZERO) {
+        /* mark this as an owned block */
+        root = rc_allocblock(node);
+        if (root)
+            root = writable(root);
+    } else if (rc_writeblock(getid(root), node) < 0) {
+        freeblock(node);
+        return ZERO;
+    }
+
+    freeblock(node);
+    return root;
+}
+
+/**
+ * snapshot: create a snapshot
+ *   @root: old root node
+ *
+ * Reads the block behind @root, copies it in memory and stores the copy
+ * as a newly allocated block.
+ *
+ *   @return: new root node (marked writable), 0 on error
+ */
+u64 snapshot(u64 root) {
+    radix_tree_node orig, copy;
+    u64 newid;
+
+    orig = rc_readblock(getid(root));
+    if (orig == NULL)
+        return ZERO;
+
+    /* the original buffer is not needed once the copy exists (or failed) */
+    copy = cloneblock(orig);
+    freeblock(orig);
+    if (copy == NULL)
+        return ZERO;
+
+    newid = rc_allocblock(copy);
+    freeblock(copy);
+
+    return (newid == ZERO) ? ZERO : writable(newid);
+}
+
+/**
+ * collapse: collapse a parent onto a child.
+ *   @height: height in bits of the tree rooted at @proot / @croot
+ *   @proot: parent root id (LSB set if writable)
+ *   @croot: child root id (LSB set if writable)
+ *
+ * NOTE: This assumes that parent and child really are, and further that
+ * there are no other children forked from this parent. (children of the
+ * child are okay...)
+ *
+ * Walks the two nodes slot by slot, recursing into every parent link that
+ * is writable and differs from the child's link, and releases the parent
+ * block when both roots are writable.
+ *
+ *   @return: -1 if either id is ZERO or a block cannot be read; otherwise
+ *            the running total described below.
+ *
+ * NOTE(review): a -1 from a recursive call overwrites the total with -1,
+ * but later successful calls keep adding to it, so the result after a
+ * partial failure is not a reliable block count.
+ */
+
+int collapse(int height, u64 proot, u64 croot)
+{
+    /* ret starts at 0 so the final check is defined when no link recursed */
+    int i, numlinks, ret = 0, total = 0;
+    radix_tree_node pnode, cnode;
+
+    if (height == 0) {
+        height = -1; /* terminate recursion */
+    } else {
+        height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+    }
+    numlinks = (1UL << RADIX_TREE_MAP_SHIFT);
+
+    /* Terminal cases: */
+
+    if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) )
+        return -1;
+
+    /* get roots */
+    if ((pnode = readblock(getid(proot))) == NULL)
+        return -1;
+
+    if ((cnode = readblock(getid(croot))) == NULL)
+    {
+        freeblock(pnode);
+        return -1;
+    }
+
+    /* For each writable link in proot */
+    for (i=0; i<numlinks; i++)
+    {
+        /* identical links are shared with the child: nothing to reclaim */
+        if ( pnode[i] == cnode[i] ) continue;
+
+        /* collapse (next level) */
+        /* if height != 0 and writable... */
+        if (( height >= 0 ) && ( iswritable(pnode[i]) ) )
+        {
+            ret = collapse(height, pnode[i], cnode[i]);
+            if (ret == -1)
+            {
+                total = -1;
+            } else {
+                total += ret;
+            }
+        }
+    }
+
+    /* the in-memory copies are no longer needed; previously they leaked */
+    freeblock(pnode);
+    freeblock(cnode);
+
+    /* if plink is writable, AND clink is writable -> free plink block */
+    if ( ( iswritable(proot) ) && ( iswritable(croot) ) )
+    {
+        releaseblock(getid(proot));
+        if (ret >=0) total++;
+    }
+
+    return total;
+
+}
+
+
+/*
+ * print_root: dump the tree below @root as a Graphviz "dot" graph.
+ *   @root: root node id (LSB set if writable; writable nodes are drawn bold)
+ *   @height: height in bits of the tree below @root
+ *   @dot_f: stream to append to, or NULL for a top-level call
+ *
+ * A top-level call (@dot_f == NULL) creates "radix.dot", writes the graph
+ * preamble and the node for @root, and closes the graph on every exit path.
+ * Recursive calls pass the open stream down and only add nodes and edges.
+ */
+void print_root(u64 root, int height, FILE *dot_f)
+{
+    /* recursive calls write to the caller's stream */
+    FILE *f = dot_f;
+    int i;
+    radix_tree_node node = NULL;
+    char *style[2] = { "", "style=bold,color=blue," };
+
+    if (dot_f == NULL) {
+        f = fopen("radix.dot", "w");
+        if (f == NULL) {
+            perror("print_root: open");
+            return;
+        }
+
+        /* write graph preamble */
+        fprintf(f, "digraph G {\n");
+
+        /* add a node for this root. */
+        fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n",
+                getid(root), style[iswritable(root)], getid(root));
+    }
+
+    printf("print_root(%Ld)\n", getid(root));
+
+    /* base case */
+    if (height == 0) {
+        /* add a node and edge for each child root */
+        node = (radix_tree_node) readblock(getid(root));
+        if (node == NULL)
+            goto out;
+
+        for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) {
+            if (node[i] != ZERO) {
+                fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n",
+                        getid(node[i]), style[iswritable(node[i])],
+                        getid(node[i]));
+                fprintf(f, "   n%Ld -> n%Ld [label=\"%d\"]\n", getid(root),
+                        getid(node[i]), i);
+            }
+        }
+        goto out;
+    }
+
+    /* the root block may be smaller to ensure all leaves are full */
+    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+
+    if (getid(root) == ZERO)
+        goto out;
+
+    node = (radix_tree_node) readblock(getid(root));
+    if (node == NULL)
+        goto out;
+
+    /* add a node and edge for each child root */
+    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
+        if (node[i] != ZERO) {
+            fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n",
+                    getid(node[i]), style[iswritable(node[i])],
+                    getid(node[i]));
+
+            print_root(node[i], height-RADIX_TREE_MAP_SHIFT, f);
+            fprintf(f, "   n%Ld -> n%Ld [label=\"%d\"]\n", getid(root),
+                    getid(node[i]), i);
+        }
+
+out:
+    if (node != NULL)
+        freeblock(node);
+
+    /* write graph postamble; only the call that opened the file closes it */
+    if (dot_f == NULL) {
+        fprintf(f, "}\n");
+        fclose(f);
+    }
+}
+
+#ifdef RADIX_STANDALONE
+
+/*
+ * Interactive test driver, built only when RADIX_STANDALONE is defined.
+ * Reads one command per line from stdin and applies it to a tree of
+ * height 34; the loop ends on "quit", end of input or a read error.
+ */
+int main(int argc, char **argv) {
+    u64 key = ZERO, val = ZERO;
+    u64 root = writable(2ULL);
+    u64 p = ZERO, c = ZERO;
+    int v;
+    char buff[4096];
+
+    __init_blockstore();
+
+    /* store one all-zero block before any command runs */
+    memset(buff, 0, 4096);
+    allocblock(buff);
+
+    printf("Recognized commands:\n"
+           "Note: the LSB of a node number indicates if it is writable\n"
+           "  root <node>               set root to <node>\n"
+           "  snapshot                  take a snapshot of the root\n"
+           "  set <key> <val>           set key=val\n"
+           "  get <key>                 query key\n"
+           "  c <proot> <croot>         collapse\n"
+           "  pr                        print tree to dot\n"
+           "  pf <1=verbose>            print freelist\n"
+           "  quit\n"
+           "\nroot = %Ld\n", root);
+    for (;;) {
+        printf("> ");
+        fflush(stdout);
+        /* NULL covers both end of input and read errors */
+        if (fgets(buff, sizeof(buff), stdin) == NULL)
+            break;
+        if (sscanf(buff, " root %Ld", &root) == 1) {
+            printf("root set to %Ld\n", root);
+        } else if (sscanf(buff, " set %Ld %Ld", &key, &val) == 2) {
+            root = update(34, root, key, val);
+            printf("root = %Ld\n", root);
+        } else if (sscanf(buff, " c %Ld %Ld", &p, &c) == 2) {
+            v = collapse(34, p, c);
+            printf("reclaimed %d blocks.\n", v);
+        } else if (sscanf(buff, " get %Ld", &key) == 1) {
+            val = lookup(34, root, key);
+            printf("value = %Ld\n", val);
+        } else if (!strcmp(buff, "quit\n")) {
+            break;
+        } else if (!strcmp(buff, "snapshot\n")) {
+            root = snapshot(root);
+            printf("new root = %Ld\n", root);
+        } else if (sscanf(buff, " pr %Ld", &root) == 1) {
+            /* "pr <node>" switches the root to <node> before printing */
+            print_root(root, 34, NULL);
+        } else if (!strcmp(buff, "pr\n")) {
+            /* bare "pr", as listed in the help text: print the current root */
+            print_root(root, 34, NULL);
+        } else if (sscanf(buff, " pf %d", &v) == 1) {
+            freelist_count(v);
+        } else if (!strcmp(buff, "pf\n")) {
+            freelist_count(0);
+        } else {
+            printf("command not recognized\n");
+        }
+    }
+    return 0;
+}
+
+#endif
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/radix.h
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/radix.h     Sun Jul  3 22:36:48 2005
@@ -0,0 +1,45 @@
+/*
+ * Radix tree for mapping (up to) 63-bit virtual block IDs to
+ * 63-bit global block IDs
+ *
+ * Pointers within the tree set aside the least significant bit to indicate
+ * whether or not the target block is writable from this node.
+ *
+ * The block with ID 0 is assumed to be an empty block of all zeros
+ */
+
+#ifndef __RADIX_H__
+#define __RADIX_H__
+
+/* I don't really like exposing these, but... */
+#define getid(x) (((x)>>1)&0x7fffffffffffffffLL) /* drop the writable bit */
+#define putid(x) ((x)<<1)                        /* id -> link, LSB clear */
+#define writable(x) (((x)<<1)|1LL)               /* id -> link, LSB set */
+#define iswritable(x) ((x)&1LL)                  /* test the LSB of a link */
+#define ZERO 0LL
+#define ONE 1LL
+#define ONEMASK (~ONE) /* every bit except the LSB; was short one hex digit */
+
+#define RADIX_TREE_MAP_SHIFT 9
+#define RADIX_TREE_MAP_MASK 0x1ff
+#define RADIX_TREE_MAP_ENTRIES 512
+
+typedef u64 *radix_tree_node;
+
+
+/*
+ * main api
+ * with these functions, the LSB of root always indicates
+ * whether or not the block is writable, including the return
+ * values of update and snapshot
+ */
+u64 lookup(int height, u64 root, u64 key);
+u64 update(int height, u64 root, u64 key, u64 val);
+u64 snapshot(u64 root);
+int collapse(int height, u64 proot, u64 croot);
+int isprivate(int height, u64 root, u64 key);
+
+
+void __rcache_init(void);
+
+#endif /* __RADIX_H__ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/blockstored.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/blockstored.c       Sun Jul  3 22:36:48 2005
@@ -0,0 +1,276 @@
+/**************************************************************************
+ * 
+ * blockstored.c
+ *
+ * Block store daemon.
+ *
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
+#include <errno.h>
+#include "blockstore.h"
+
+//#define BSDEBUG
+
+int readblock_into(u64 id, void *block);
+
+/*
+ * open_socket: create a datagram (SOCK_DGRAM) IPv4 socket bound to @port
+ * on INADDR_ANY.
+ *
+ *   @return: the socket descriptor, or -1 if socket() or bind() fails
+ */
+int open_socket(u16 port) {
+    struct sockaddr_in addr;
+    int fd = socket(AF_INET, SOCK_DGRAM, 0);
+
+    if (fd < 0) {
+        perror("Bad socket");
+        return -1;
+    }
+
+    memset(&addr, 0, sizeof(addr));
+    addr.sin_family      = AF_INET;
+    addr.sin_port        = htons(port);
+    addr.sin_addr.s_addr = htonl(INADDR_ANY);
+
+    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) >= 0)
+        return fd;
+
+    /* bind failed: report it and give the descriptor back */
+    perror("bind");
+    close(fd);
+    return -1;
+}
+
+static int block_fp = -1;
+static int bssock = -1;
+
+/*
+ * send_reply: send @len bytes of @buffer to @peer over bssock.
+ *
+ *   @return: 0 on success, 1 if sendto() fails
+ */
+int send_reply(struct sockaddr_in *peer, void *buffer, int len) {
+
+    int rc;
+
+#ifdef BSDEBUG
+    fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n",
+            len, ((bsmsg_t *)buffer)->hdr.operation,
+            ((bsmsg_t *)buffer)->hdr.id);
+#endif
+    rc = sendto(bssock, buffer, len, 0,
+                (struct sockaddr *)peer, sizeof(*peer));
+    if (rc < 0) {
+        perror("send_reply");
+        return 1;
+    }
+
+
+    return 0;
+}
+
+static bsmsg_t msgbuf;
+
+/*
+ * service_loop: receive requests on bssock and answer them; never returns.
+ *
+ * Each datagram is read into the single static msgbuf, which is also reused
+ * for the reply. Failed operations are answered with BSOP_FLAG_ERROR and a
+ * MSGBUFSIZE_ID-sized reply; a successful read is answered with the full
+ * MSGBUFSIZE_BLOCK-sized message. Packets too short for their operation are
+ * logged and dropped without a reply.
+ */
+void service_loop(void) {
+
+    for (;;) {
+        int rc, len;
+        struct sockaddr_in from;
+        /* recvfrom() takes a socklen_t *, which need not match size_t */
+        socklen_t slen = sizeof(from);
+        u64 bid;
+
+        len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0,
+                       (struct sockaddr *)&from, &slen);
+
+        if (len < 0) {
+            perror("recvfrom");
+            continue;
+        }
+
+        if (len < MSGBUFSIZE_OP) {
+            fprintf(stderr, "Short packet.\n");
+            continue;
+        }
+
+#ifdef BSDEBUG
+        fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n",
+                len, msgbuf.hdr.operation, msgbuf.hdr.id);
+#endif
+
+        switch (msgbuf.hdr.operation) {
+        case BSOP_READBLOCK:
+            if (len < MSGBUFSIZE_ID) {
+                fprintf(stderr, "Short packet (readblock %u).\n", len);
+                continue;
+            }
+            rc = readblock_into(msgbuf.hdr.id, msgbuf.block);
+            if (rc < 0) {
+                fprintf(stderr, "readblock error\n");
+                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
+                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+                continue;
+            }
+            msgbuf.hdr.flags = 0;
+            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK);
+            break;
+        case BSOP_WRITEBLOCK:
+            if (len < MSGBUFSIZE_BLOCK) {
+                fprintf(stderr, "Short packet (writeblock %u).\n", len);
+                continue;
+            }
+            rc = writeblock(msgbuf.hdr.id, msgbuf.block);
+            if (rc < 0) {
+                fprintf(stderr, "writeblock error\n");
+                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
+                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+                continue;
+            }
+            msgbuf.hdr.flags = 0;
+            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+            break;
+        case BSOP_ALLOCBLOCK:
+            if (len < MSGBUFSIZE_BLOCK) {
+                fprintf(stderr, "Short packet (allocblock %u).\n", len);
+                continue;
+            }
+            bid = allocblock(msgbuf.block);
+            if (bid == ALLOCFAIL) {
+                fprintf(stderr, "allocblock error\n");
+                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
+                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+                continue;
+            }
+            msgbuf.hdr.id = bid;
+            msgbuf.hdr.flags = 0;
+            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+            break;
+        default:
+            /* no reply is sent, as before, but the drop is now logged */
+            fprintf(stderr, "Unknown operation %u.\n", msgbuf.hdr.operation);
+            break;
+        }
+
+    }
+}
+ 
+/**
+ * readblock_into: read a block from disk into a caller-supplied buffer
+ *   @id: block id to read; the block is read from file offset
+ *        (id - 1) * BLOCK_SIZE
+ *   @block: pointer to buffer to receive block
+ *
+ *   @return: 0 if OK, -1 on error (seek failure or a read that does not
+ *            return exactly BLOCK_SIZE bytes)
+ */
+
+int readblock_into(u64 id, void *block) {
+    off64_t where = ((off64_t) id - 1LL) * BLOCK_SIZE;
+    ssize_t got;
+
+    if (lseek64(block_fp, where, SEEK_SET) < 0) {
+        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
+        perror("readblock lseek");
+        return -1;
+    }
+
+    got = read(block_fp, block, BLOCK_SIZE);
+    if (got != BLOCK_SIZE) {
+        perror("readblock read");
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * writeblock: write an existing block to disk
+ *   @id: block id; the block is written at file offset
+ *        (id - 1) * BLOCK_SIZE
+ *   @block: pointer to block
+ *
+ *   @return: zero on success, -1 on failure (seek failure, write error,
+ *            or a write of fewer than BLOCK_SIZE bytes)
+ */
+int writeblock(u64 id, void *block) {
+    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
+        perror("writeblock lseek");
+        return -1;
+    }
+    /* a short write leaves a partial block, so treat it as a failure too */
+    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
+        perror("writeblock write");
+        return -1;
+    }
+    return 0;
+}
+
+/**
+ * allocblock: write a new block to disk
+ *   @block: pointer to block
+ *
+ * The block is appended at the current end of the file, which must be a
+ * multiple of BLOCK_SIZE. With BS_ALLOC_HACK defined, the append is
+ * repeated until the resulting id reaches BS_ALLOC_SKIP.
+ *
+ *   @return: new id of block on disk, ALLOCFAIL on error
+ */
+static u64 lastblock = 0;
+
+u64 allocblock(void *block) {
+    u64 lb;
+    off64_t pos;
+
+    /* the label is only a jump target for the BS_ALLOC_HACK loop below */
+#ifdef BS_ALLOC_HACK
+    retry:
+#endif
+    pos = lseek64(block_fp, 0, SEEK_END);
+    if (pos == (off64_t)-1) {
+        perror("allocblock lseek");
+        return ALLOCFAIL;
+    }
+    if (pos % BLOCK_SIZE != 0) {
+        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
+        return ALLOCFAIL;
+    }
+    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
+        perror("allocblock write");
+        return ALLOCFAIL;
+    }
+    /* ids start at 1: the block at offset 0 has id 1 */
+    lb = pos / BLOCK_SIZE + 1;
+
+#ifdef BS_ALLOC_HACK
+    if (lb < BS_ALLOC_SKIP)
+        goto retry;
+#endif
+
+    /* sanity check: ids handed out by this process should only grow */
+    if (lb <= lastblock)
+        printf("[*** %Ld alredy allocated! ***]\n", lb);
+
+    lastblock = lb;
+    return lb;
+}
+
+/**
+ * newblock: get a new in-memory block set to zeros
+ *
+ * The block is BLOCK_SIZE bytes of heap memory.
+ *
+ *   @return: pointer to new block, NULL on error
+ */
+void *newblock() {
+    /* calloc hands back zero-filled memory in one step */
+    void *fresh = calloc(1, BLOCK_SIZE);
+
+    if (fresh == NULL)
+        perror("newblock");
+
+    return fresh;
+}
+
+
+/**
+ * freeblock: unallocate an in-memory block
+ *   @block: block to be freed; NULL is accepted and ignored
+ */
+void freeblock(void *block) {
+    if (block == NULL)
+        return;
+
+    free(block);
+}
+
+
+/*
+ * Block store daemon entry point: open (creating if needed) the backing
+ * file "blockstore.dat", bind the service socket, then serve requests.
+ * Returns -1 if either the file or the socket cannot be set up.
+ */
+int main(int argc, char **argv)
+{
+    int status = -1;
+
+    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
+
+    if (block_fp >= 0) {
+        bssock = open_socket(BLOCKSTORED_PORT);
+        if (bssock >= 0) {
+            service_loop();
+
+            close(bssock);
+            status = 0;
+        }
+    } else {
+        perror("open");
+    }
+
+    return status;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/README-PARALLAX
--- a/tools/blktap/README-PARALLAX      Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,177 +0,0 @@
-Parallax Quick Overview
-March 3, 2005
-
-This is intended to provide a quick set of instructions to let you
-guys play with the current parallax source.  In it's current form, the
-code will let you run an arbitrary number of VMs off of a single disk
-image, doing copy-on-write as they make updates.  Each domain is
-assigned a virtual disk image (VDI), which may be based on a snapshot
-of an existing image.  All of the VDI and snapshot management should
-currently work.
-
-The current implementation uses a single file as a blockstore for
-_everything_ this will soon be replaced by the fancier backend code
-and the local cache.  As it stands, Parallax will create
-"blockstore.dat" in the directory that you run it from, and use
-largefile support to make this grow to unfathomable girth.  So, you
-probably want to run the daemon off of a local disk, with a lot of
-free space.
-
-Here's how to get going:
-
-0. Setup:
----------
-
-Pick a local directory on a disk with lots of room.  You should be
-running from a privileged domain (e.g. dom0) with the blocktap
-configured in and block backend NOT.
-
-For convenience (for the moment) copy all of the vdi tools (vdi_*) and
-the parallax daemon from tools/blktap into this directory.
-
-1. Populate the blockstore:
----------------------------
-
-First you need to put at least one image into the blockstore.  You
-will need a disk image, either as a file or local partition.  My
-general approach has been to
-
-(a) make a really big sparse file with 
-
-        dd if=/dev/zero of=./image bs=4K count=1 seek=[big value]
-
-(b) put a filesystem into it
-
-        mkfs.ext3 ./image
-
-(c) mount it using loopback
-
-        mkdir ./mnt
-        mount -o loop ./image
-
-(d) cd into it and untar one of the image files from srg-roots.
-
-        cd mnt
-        tar ...
-
-NOTE: Beware if your system is FC3.  mkfs is not compatible with old
-versions of fedora, and so you don't have much choice but to install
-further fc3 images if you have used the fc3 version of mkfs.
-
-(e) unmount the image
-
-        cd ..
-        umount mnt
-
-(f) now, create a new VDI to hold the image 
-
-        ./vdi_create "My new FC3 VDI"
-
-(g) get the id of the new VDI.
-
-        ./vdi_list
-
-        |      0                     My new FC3 VDI
-
-(0 is the VDI id... create a few more if you want.)
-
-(h) hoover your image into the new VDI.
-
-        ./vdi_fill 0 ./image
-
-This will pull the entire image into the blockstore and set up a
-mapping tree for it for VDI 0.  Passing a device (i.e. /dev/sda3)
-should also work, but vdi_fill has NO notion of sparseness yet, so you
-are going to pump a block into the store for each block you read.
-
-vdi_fill will count up until it is done, and you should be ready to
-go.  If you want to be anal, you can use vdi_validate to test the VDI
-against the original image.
-
-2. Create some extra VDIs
--------------------------
-
-VDIs are actually a list of snapshots, and each snapshot is a full
-image of mappings.  So, to preserve an immutable copy of a current
-VDI, do this:
-
-(a) Snapshot your new VDI.
-
-        ./vdi_snap 0
-
-Snapshotting writes the current radix root to the VDI's snapshot log,
-and assigns it a new writable root.
-
-(b) look at the VDI's snapshot log.
-
-        ./vdi_snap_list 0
-
-        | 16   0      Thu Mar  3 19:27:48 2005 565111           31
-
-The first two columns constitute a snapshot id and represent the
-(block, offset) of the snapshot record.  The Date tells you when the
-snapshot was made, and 31 is the radix root node of the snapshot.
-
-(c) Create a new VDI, based on that snapshot, and look at the list.
-
-        ./vdi_create "FC3 - Copy 1" 16 0
-        ./vdi_list
-
-        |      0                     My new FC3 VDI
-        |      1                       FC3 - Copy 1
-
-NOTE: If you have Graphviz installed on your system, you can use
-vdi_tree to generate a postscript of your current set of VDIs and
-snapshots.
-
-
-Create as many VDIs as you need for the VMs that you want to run.
-
-3. Boot some VMs:
------------------
-
-Parallax currently uses a hack in xend to pass the VDI id, you need to
-modify the disk line of the VM config that is going to mount it.
-
-(a) set up your vm config, by using the following disk line:
-
-        disk = ['parallax:1,sda1,w,0' ]
-
-This example uses VDI 1 (from vdi_list above), presents it as sda1
-(writable), and uses dom 0 as the backend.  If you were running the
-daemon (and tap driver) in some domain other than 0, you would change
-this last parameter.
-
-NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so that it knows what to do with "parallax:".
-
-(b) Run parallax in the backend domain.
-
-        ./parallax
-
-(c) create your new domain.
-
-        xm create ...
-
----
-
-That's pretty much all there is to it at the moment.  Hope this is
-clear enough to get you going.  Now, a few serious caveats that will
-be sorted out in the almost immediate future:
-
-WARNINGS:
----------
-
-1. There is NO locking in the VDI tools at the moment, so I'd avoid
-running them in parallel, or more importantly, running them while the
-daemon is running.
-
-2. I doubt that xend will be very happy about restarting if you have
-parallax-using domains.  So if it dies while there are active parallax
-doms, you may need to reboot.
-
-3. I've turned off write-in-place.  So at the moment, EVERY block
-write is a log append on the blockstore.  I've been having some probs
-with the radix tree's marking of writable blocks after snapshots and
-will sort this out very soon.
-
-
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/block-async.c
--- a/tools/blktap/block-async.c        Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,393 +0,0 @@
-/* block-async.c
- * 
- * Asynchronous block wrappers for parallax.
- */
- 
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <pthread.h>
-#include "block-async.h"
-#include "blockstore.h"
-#include "vdi.h"
-
-
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-/* We have a queue of outstanding I/O requests implemented as a 
- * circular producer-consumer ring with free-running buffers.
- * to allow reordering, this ring indirects to indexes in an 
- * ring of io_structs.
- * 
- * the block_* calls may either add an entry to this ring and return, 
- * or satisfy the request immediately and call the callback directly.
- * None of the io calls in parallax should be nested enough to worry 
- * about stack problems with this approach.
- */
-
-struct read_args {
-    u64 addr;
-};
-
-struct write_args {
-    u64   addr;
-    char *block;
-};
-
-struct alloc_args {
-    char *block;
-};
- 
-struct pending_io_req {
-    enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op;
-    union {
-        struct read_args  r;
-        struct write_args w;
-        struct alloc_args a;
-    } u;
-    io_cb_t cb;
-    void *param;
-};
-
-void radix_lock_init(struct radix_lock *r)
-{
-    int i;
-    
-    pthread_mutex_init(&r->lock, NULL);
-    for (i=0; i < 1024; i++) {
-        r->lines[i] = 0;
-        r->waiters[i] = NULL;
-        r->state[i] = ANY;
-    }
-}
-
-/* maximum outstanding I/O requests issued asynchronously */
-/* must be a power of 2.*/
-#define MAX_PENDING_IO 1024
-
-/* how many threads to concurrently issue I/O to the disk. */
-#define IO_POOL_SIZE   10
-
-static struct pending_io_req pending_io_reqs[MAX_PENDING_IO];
-static int pending_io_list[MAX_PENDING_IO];
-static unsigned long io_prod = 0, io_cons = 0, io_free = 0;
-#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1))
-#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs)
-#define PENDING_IO_ENT(_x) \
-       (&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]])
-#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod)
-#define CAN_CONSUME_PENDING_IO (io_cons != io_prod)
-static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t  pending_io_cond = PTHREAD_COND_INITIALIZER;
-
-static void init_pending_io(void)
-{
-    int i;
-       
-    for (i=0; i<MAX_PENDING_IO; i++)
-        pending_io_list[i] = i;
-               
-} 
-
-void block_read(u64 addr, io_cb_t cb, void *param)
-{
-    struct pending_io_req *req;
-    
-    pthread_mutex_lock(&pending_io_lock);
-    assert(CAN_PRODUCE_PENDING_IO);
-    
-    req = PENDING_IO_ENT(io_prod++);
-    DPRINTF("Produce (R) %lu (%p)\n", io_prod - 1, req);
-    req->op = IO_READ;
-    req->u.r.addr = addr;
-    req->cb = cb;
-    req->param = param;
-    
-    pthread_cond_signal(&pending_io_cond);
-    pthread_mutex_unlock(&pending_io_lock);    
-}
-
-
-void block_write(u64 addr, char *block, io_cb_t cb, void *param)
-{
-    struct pending_io_req *req;
-    
-    pthread_mutex_lock(&pending_io_lock);
-    assert(CAN_PRODUCE_PENDING_IO);
-    
-    req = PENDING_IO_ENT(io_prod++);
-    DPRINTF("Produce (W) %lu (%p)\n", io_prod - 1, req);
-    req->op = IO_WRITE;
-    req->u.w.addr  = addr;
-    req->u.w.block = block;
-    req->cb = cb;
-    req->param = param;
-    
-    pthread_cond_signal(&pending_io_cond);
-    pthread_mutex_unlock(&pending_io_lock);    
-}
-
-
-void block_alloc(char *block, io_cb_t cb, void *param)
-{
-    struct pending_io_req *req;
-       
-    pthread_mutex_lock(&pending_io_lock);
-    assert(CAN_PRODUCE_PENDING_IO);
-    
-    req = PENDING_IO_ENT(io_prod++);
-    req->op = IO_ALLOC;
-    req->u.a.block = block;
-    req->cb = cb;
-    req->param = param;
-    
-    pthread_cond_signal(&pending_io_cond);
-    pthread_mutex_unlock(&pending_io_lock);    
-}
-
-void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
-{
-    struct io_ret ret;
-    pthread_mutex_lock(&r->lock);
-    
-    if (( r->lines[row] >= 0 ) && (r->state[row] != STOP)) {
-        r->lines[row]++;
-        r->state[row] = READ;
-        DPRINTF("RLOCK  : %3d (row: %d)\n", r->lines[row], row);
-        pthread_mutex_unlock(&r->lock);
-        ret.type = IO_INT_T;
-        ret.u.i = 0;
-        cb(ret, param);
-    } else {
-        struct radix_wait **rwc;
-        struct radix_wait *rw = 
-            (struct radix_wait *) malloc (sizeof(struct radix_wait));
-        DPRINTF("RLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
-        rw->type  = RLOCK;
-        rw->param = param;
-        rw->cb    = cb;
-        rw->next  = NULL;
-        /* append to waiters list. */
-        rwc = &r->waiters[row];
-        while (*rwc != NULL) rwc = &(*rwc)->next;
-        *rwc = rw;
-        pthread_mutex_unlock(&r->lock);
-        return;
-    }
-}
-
-
-void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
-{
-    struct io_ret ret;
-    pthread_mutex_lock(&r->lock);
-    
-    /* the second check here is redundant -- just here for debugging now. */
-    if ((r->state[row] == ANY) && ( r->lines[row] == 0 )) {
-        r->state[row] = STOP;
-        r->lines[row] = -1;
-        DPRINTF("WLOCK  : %3d (row: %d)\n", r->lines[row], row);
-        pthread_mutex_unlock(&r->lock);
-        ret.type = IO_INT_T;
-        ret.u.i = 0;
-        cb(ret, param);
-    } else {
-        struct radix_wait **rwc;
-        struct radix_wait *rw = 
-            (struct radix_wait *) malloc (sizeof(struct radix_wait));
-        DPRINTF("WLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
-        rw->type  = WLOCK;
-        rw->param = param;
-        rw->cb    = cb;
-        rw->next  = NULL;
-        /* append to waiters list. */
-        rwc = &r->waiters[row];
-        while (*rwc != NULL) rwc = &(*rwc)->next;
-        *rwc = rw;
-        pthread_mutex_unlock(&r->lock);
-        return;
-    }
-       
-}
-
-/* called with radix_lock locked and lock count of zero. */
-static void wake_waiters(struct radix_lock *r, int row)
-{
-    struct pending_io_req *req;
-    struct radix_wait *rw;
-    
-    if (r->lines[row] != 0) return;
-    if (r->waiters[row] == NULL) return; 
-    
-    if (r->waiters[row]->type == WLOCK) {
-
-        rw = r->waiters[row];
-        pthread_mutex_lock(&pending_io_lock);
-        assert(CAN_PRODUCE_PENDING_IO);
-        
-        req = PENDING_IO_ENT(io_prod++);
-        req->op    = IO_WWAKE;
-        req->cb    = rw->cb;
-        req->param = rw->param;
-        r->lines[row] = -1; /* write lock the row. */
-        r->state[row] = STOP;
-        r->waiters[row] = rw->next;
-        free(rw);
-        pthread_mutex_unlock(&pending_io_lock);
-    
-    } else /* RLOCK */ {
-
-        while ((r->waiters[row] != NULL) && (r->waiters[row]->type == RLOCK)) {
-            rw = r->waiters[row];
-            pthread_mutex_lock(&pending_io_lock);
-            assert(CAN_PRODUCE_PENDING_IO);
-            
-            req = PENDING_IO_ENT(io_prod++);
-            req->op    = IO_RWAKE;
-            req->cb    = rw->cb;
-            req->param = rw->param;
-            r->lines[row]++; /* read lock the row. */
-            r->state[row] = READ; 
-            r->waiters[row] = rw->next;
-            free(rw);
-            pthread_mutex_unlock(&pending_io_lock);
-        }
-
-        if (r->waiters[row] != NULL) /* There is a write queued still */
-            r->state[row] = STOP;
-    }  
-    
-    pthread_mutex_lock(&pending_io_lock);
-    pthread_cond_signal(&pending_io_cond);
-    pthread_mutex_unlock(&pending_io_lock);
-}
-
-void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
-{
-    struct io_ret ret;
-       
-    pthread_mutex_lock(&r->lock);
-    assert(r->lines[row] > 0); /* try to catch misuse. */
-    r->lines[row]--;
-    if (r->lines[row] == 0) {
-        r->state[row] = ANY;
-        wake_waiters(r, row);
-    }
-    pthread_mutex_unlock(&r->lock);
-    cb(ret, param);
-}
-
-void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
-{
-    struct io_ret ret;
-    
-    pthread_mutex_lock(&r->lock);
-    assert(r->lines[row] == -1); /* try to catch misuse. */
-    r->lines[row] = 0;
-    r->state[row] = ANY;
-    wake_waiters(r, row);
-    pthread_mutex_unlock(&r->lock);
-    cb(ret, param);
-}
-
-/* consumer calls */
-static void do_next_io_req(struct pending_io_req *req)
-{
-    struct io_ret          ret;
-    void  *param;
-    
-    switch (req->op) {
-    case IO_READ:
-        ret.type = IO_BLOCK_T;
-        ret.u.b  = readblock(req->u.r.addr);
-        break;
-    case IO_WRITE:
-        ret.type = IO_INT_T;
-        ret.u.i  = writeblock(req->u.w.addr, req->u.w.block);
-        DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr);
-        break;
-    case IO_ALLOC:
-        ret.type = IO_ADDR_T;
-        ret.u.a  = allocblock(req->u.a.block);
-        break;
-    case IO_RWAKE:
-        DPRINTF("WAKE DEFERRED RLOCK!\n");
-        ret.type = IO_INT_T;
-        ret.u.i  = 0;
-        break;
-    case IO_WWAKE:
-        DPRINTF("WAKE DEFERRED WLOCK!\n");
-        ret.type = IO_INT_T;
-        ret.u.i  = 0;
-        break;
-    default:
-        DPRINTF("Unknown IO operation on pending list!\n");
-        return;
-    }
-    
-    param = req->param;
-    pthread_mutex_lock(&pending_io_lock);
-    pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req);
-    pthread_mutex_unlock(&pending_io_lock);
-       
-    assert(req->cb != NULL);
-    req->cb(ret, param);
-    
-}
-
-void *io_thread(void *param) 
-{
-    int tid;
-    struct pending_io_req *req;
-    
-    /* Set this thread's tid. */
-    tid = *(int *)param;
-    free(param);
-    
-start:
-    pthread_mutex_lock(&pending_io_lock);
-    while (io_prod == io_cons) {
-        pthread_cond_wait(&pending_io_cond, &pending_io_lock);
-    }
-    
-    if (io_prod == io_cons) {
-        /* unnecessary wakeup. */
-        pthread_mutex_unlock(&pending_io_lock);
-        goto start;
-    }
-    
-    req = PENDING_IO_ENT(io_cons++);
-    pthread_mutex_unlock(&pending_io_lock);
-       
-    do_next_io_req(req);
-    
-    goto start;
-       
-}
-
-static pthread_t io_pool[IO_POOL_SIZE];
-void start_io_threads(void)
-
-{      
-    int i, tid=0;
-    
-    for (i=0; i < IO_POOL_SIZE; i++) {
-        int ret, *t;
-        t = (int *)malloc(sizeof(int));
-        *t = tid++;
-        ret = pthread_create(&io_pool[i], NULL, io_thread, t);
-        if (ret != 0) printf("Error starting thread %d\n", i);
-    }
-       
-}
-
-void init_block_async(void)
-{
-    init_pending_io();
-    start_io_threads();
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/block-async.h
--- a/tools/blktap/block-async.h        Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,69 +0,0 @@
-/* block-async.h
- * 
- * Asynchronous block wrappers for parallax.
- */
- 
-#ifndef _BLOCKASYNC_H_
-#define _BLOCKASYNC_H_
-
-#include <assert.h>
-#include <xc.h>
-#include "vdi.h"
-
-/* Result handed to an io_cb_t callback: a tagged union whose 'type' says
- * which member of 'u' is valid.  Use IO_ADDR/IO_BLOCK/IO_INT to read it;
- * they assert that the tag matches. */
-struct io_ret
-{
-    enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type;
-    union {
-        u64   a;  /* valid when type == IO_ADDR_T  */
-        char *b;  /* valid when type == IO_BLOCK_T */
-        int   i;  /* valid when type == IO_INT_T   */
-    } u;
-};
-
-typedef void (*io_cb_t)(struct io_ret r, void *param);
-
-/* Per-vdi lock structures to make sure requests run in a safe order. */
-/* One queued waiter: the kind of lock wanted and the callback (with its
- * opaque argument) recorded for it; entries are chained through 'next'. */
-struct radix_wait {
-    enum {RLOCK, WLOCK} type;
-    io_cb_t  cb;
-    void    *param;
-    struct radix_wait *next;
-};
-
-/* Lock table with 1024 slots, guarded by 'lock'.  Each slot has a counter
- * (lines), a list of waiters and a state.
- * NOTE(review): presumably the slot is selected by the 'row' argument of
- * block_rlock()/block_wlock() -- confirm against block-async.c. */
-struct radix_lock {
-    pthread_mutex_t lock;
-    int                    lines[1024];
-    struct radix_wait     *waiters[1024];
-    enum {ANY, READ, STOP} state[1024];
-};
-void radix_lock_init(struct radix_lock *r);
-
-void block_read(u64 addr, io_cb_t cb, void *param);
-void block_write(u64 addr, char *block, io_cb_t cb, void *param);
-void block_alloc(char *block, io_cb_t cb, void *param);
-void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
-void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
-void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
-void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
-void init_block_async(void);
-
-/* Return the address carried by r; asserts that r is tagged IO_ADDR_T. */
-static inline u64 IO_ADDR(struct io_ret r)
-{
-    assert(r.type == IO_ADDR_T);
-    return r.u.a;
-}
-
-/* Return the block pointer carried by r; asserts that r is tagged
- * IO_BLOCK_T. */
-static inline char *IO_BLOCK(struct io_ret r)
-{
-    assert(r.type == IO_BLOCK_T);
-    return r.u.b;
-}
-
-/* Return the integer carried by r; asserts that r is tagged IO_INT_T. */
-static inline int IO_INT(struct io_ret r)
-{
-    assert(r.type == IO_INT_T);
-    return r.u.i;
-}
-
-
-#endif //_BLOCKASYNC_H_
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/blockstore.c
--- a/tools/blktap/blockstore.c Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,1350 +0,0 @@
-/**************************************************************************
- * 
- * blockstore.c
- *
- * Simple block store interface
- *
- */
- 
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <stdarg.h>
-#include "blockstore.h"
-#include <pthread.h>
-
-//#define BLOCKSTORE_REMOTE
-//#define BSDEBUG
-
-#define RETRY_TIMEOUT 1000000 /* microseconds */
-
-/*****************************************************************************
- * Debugging
- */
-#ifdef BSDEBUG
-/* Debug trace (BSDEBUG builds only): printf-style message written to stderr,
- * prefixed with the value stored under tid_key for the calling thread. */
-void DB(char *format, ...)
-{
-    va_list args;
-    fprintf(stderr, "[%05u] ", (int)pthread_getspecific(tid_key));
-    va_start(args, format);
-    vfprintf(stderr, format, args);
-    va_end(args);
-}
-#else
-#define DB(format, ...) (void)0
-#endif
-
-#ifdef BLOCKSTORE_REMOTE
-
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <netinet/in.h>
-#include <netdb.h>
-
-/*****************************************************************************
- * Network state                                                             *
- *****************************************************************************/
-
-/* The individual disk servers we talk to. These will be referenced by
- * an integer index into bsservers[].
- */
-bsserver_t bsservers[MAX_SERVERS];
-
-/* The cluster map. This is indexed by an integer cluster number.
- */
-bscluster_t bsclusters[MAX_CLUSTERS];
-
-/* Local socket.
- */
-struct sockaddr_in sin_local;
-int bssock = 0;
-
-/*****************************************************************************
- * Notification                                                              *
- *****************************************************************************/
-
-/* Per-thread wakeup state used by RECV_NOTIFY / RECV_AWAIT. */
-typedef struct pool_thread_t_struct {
-    pthread_mutex_t ptmutex;   /* protects newdata and pairs with ptcv */
-    pthread_cond_t ptcv;       /* signalled by RECV_NOTIFY */
-    int newdata;               /* set to 1 by RECV_NOTIFY, cleared by RECV_AWAIT */
-} pool_thread_t;
-
-pool_thread_t pool_thread[READ_POOL_SIZE+1];
-
-/* RECV_NOTIFY(tid): flag new data for pool thread 'tid' and signal its
- * condition variable, both under that thread's mutex.
- * Wrapped in do { } while (0) so it behaves as a single statement. */
-#define RECV_NOTIFY(tid) do { \
-    pthread_mutex_lock(&(pool_thread[(tid)].ptmutex)); \
-    pool_thread[(tid)].newdata = 1; \
-    DB("CV Waking %u", tid); \
-    pthread_cond_signal(&(pool_thread[(tid)].ptcv)); \
-    pthread_mutex_unlock(&(pool_thread[(tid)].ptmutex)); } while (0)
-/* RECV_AWAIT(tid): block until pool thread 'tid' has been notified, then
- * consume the notification.  The wait is a predicate loop so a spurious
- * wakeup does not return early, and the flag is always cleared on the way
- * out so a wakeup is never left behind as a stale notification. */
-#define RECV_AWAIT(tid) do { \
-    pthread_mutex_lock(&(pool_thread[(tid)].ptmutex)); \
-    if (pool_thread[(tid)].newdata) { \
-        DB("CV Woken %u", tid); \
-    } \
-    else { \
-        DB("CV Waiting %u", tid); \
-        while (!pool_thread[(tid)].newdata) \
-            pthread_cond_wait(&(pool_thread[(tid)].ptcv), \
-                              &(pool_thread[(tid)].ptmutex)); \
-    } \
-    pool_thread[(tid)].newdata = 0; \
-    pthread_mutex_unlock(&(pool_thread[(tid)].ptmutex)); } while (0)
-
-/*****************************************************************************
- * Message queue management                                                  *
- *****************************************************************************/
-
-/* Protects the queue-manipulation critical regions.
- */
-pthread_mutex_t ptmutex_queue;
-#define ENTER_QUEUE_CR pthread_mutex_lock(&ptmutex_queue)
-#define LEAVE_QUEUE_CR pthread_mutex_unlock(&ptmutex_queue)
-
-pthread_mutex_t ptmutex_recv;
-#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv)
-#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv)
-
-/* A message queue entry. We allocate one of these for every request we send.
- * Asynchronous reply reception also uses one of these (see rx_qe).
- */
-typedef struct bsq_t_struct {
-    struct bsq_t_struct *prev;   /* queue links, maintained by enqueue(), */
-    struct bsq_t_struct *next;   /* dequeue() and queuesearch()           */
-    int status;                  /* BSQ_STATUS_* bits; 0 when sent */
-    int server;                  /* index into bsservers[] */
-    int length;                  /* message length in bytes */
-    struct msghdr msghdr;        /* scatter/gather header for sendmsg/recvmsg */
-    struct iovec iov[2];         /* [0] = message header, [1] = block (if any) */
-    int tid;                     /* sender's thread id, used for RECV_NOTIFY */
-    struct timeval tv_sent;      /* time of the most recent (re)send */
-    bshdr_t message;             /* wire header: operation, flags, id, luid */
-    void *block;                 /* block payload buffer, or NULL */
-} bsq_t;
-
-#define BSQ_STATUS_MATCHED 1
-
-pthread_mutex_t ptmutex_luid;
-#define ENTER_LUID_CR pthread_mutex_lock(&ptmutex_luid)
-#define LEAVE_LUID_CR pthread_mutex_unlock(&ptmutex_luid)
-
-/* Next identifier to hand out; guarded by ptmutex_luid. */
-static u64 luid_cnt = 0x1000ULL;
-
-/* Return a fresh identifier: the current counter value, which is then
- * advanced by one under the luid lock. */
-u64 new_luid(void) {
-    u64 fresh;
-
-    ENTER_LUID_CR;
-    fresh = luid_cnt;
-    luid_cnt = luid_cnt + 1;
-    LEAVE_LUID_CR;
-
-    return fresh;
-}
-
-/* Queue of outstanding requests.
- */
-bsq_t *bs_head = NULL;
-bsq_t *bs_tail = NULL;
-int bs_qlen = 0;
-
-/* Dump the outstanding-request queue to stderr: one header line with 'msg'
- * and the queue length, then the luid and server of every entry.  Takes the
- * queue lock itself, so it must be called with that lock released.
- */
-void queuedebug(char *msg) {
-    bsq_t *q;
-    ENTER_QUEUE_CR;
-    fprintf(stderr, "Q: %s len=%u\n", msg, bs_qlen);
-    for (q = bs_head; q; q = q->next) {
-        fprintf(stderr, "  luid=%016llx server=%u\n",
-                q->message.luid, q->server);
-    }
-    LEAVE_QUEUE_CR;
-}
-
-/* Append qe to the tail of the outstanding-request queue.  Always returns 0. */
-int enqueue(bsq_t *qe) {
-    ENTER_QUEUE_CR;
-    qe->prev = bs_tail;
-    qe->next = NULL;
-    if (bs_head != NULL)
-        bs_tail->next = qe;   /* non-empty queue: link after the old tail */
-    else
-        bs_head = qe;         /* empty queue: qe is also the head */
-    bs_tail = qe;
-    bs_qlen += 1;
-    LEAVE_QUEUE_CR;
-
-#ifdef BSDEBUG
-    queuedebug("enqueue");
-#endif
-
-    return 0;
-}
-
-/*
- * dequeue: unlink a request from the outstanding-request queue.
- *   @qe: entry to remove (matched by pointer identity)
- *
- *   @return: 1 if qe was on the queue and has been removed, 0 otherwise
- */
-int dequeue(bsq_t *qe) {
-    bsq_t *q;
-    int found = 0;
-
-    ENTER_QUEUE_CR;
-    for (q = bs_head; q; q = q->next) {
-        if (q != qe)
-            continue;
-        if (q->prev)
-            q->prev->next = q->next;
-        else
-            bs_head = q->next;
-        if (q->next)
-            q->next->prev = q->prev;
-        else
-            bs_tail = q->prev;
-        bs_qlen--;
-        found = 1;
-        break;
-    }
-    LEAVE_QUEUE_CR;
-
-#ifdef BSDEBUG
-    /* The success path used to log "dequeue not found" as well. */
-    queuedebug(found ? "dequeue found" : "dequeue not found");
-#endif
-    return found;
-}
-
-/*
- * queuesearch: match a received reply against the outstanding requests.
- *   @qe: the received message
- *
- * A request matches when server, operation and luid are all equal.  On a
- * match the reply's length, flags and id are copied into the request, the
- * request is marked BSQ_STATUS_MATCHED and unlinked from the queue.  For a
- * read, ownership of the reply's block buffer moves to the request
- * (qe->block is set to NULL).
- *
- *   @return: the matched request, or NULL if nothing matched
- */
-bsq_t *queuesearch(bsq_t *qe) {
-    bsq_t *q;
-    ENTER_QUEUE_CR;
-    for (q = bs_head; q; q = q->next) {
-        if ((qe->server == q->server) &&
-            (qe->message.operation == q->message.operation) &&
-            (qe->message.luid == q->message.luid)) {
-
-            /* NOTE(review): this tests the queued request's flags, not the
-             * reply's -- confirm that is what was intended. */
-            if ((q->message.operation == BSOP_READBLOCK) &&
-                ((q->message.flags & BSOP_FLAG_ERROR) == 0)) {
-                q->block = qe->block;
-                qe->block = NULL;
-            }
-            q->length = qe->length;
-            q->message.flags = qe->message.flags;
-            q->message.id = qe->message.id;
-            q->status |= BSQ_STATUS_MATCHED;
-
-            /* Unlink q from the doubly linked queue. */
-            if (q->prev)
-                q->prev->next = q->next;
-            else 
-                bs_head = q->next;
-            if (q->next)
-                q->next->prev = q->prev;
-            else
-                bs_tail = q->prev;
-            q->next = NULL;
-            q->prev = NULL;
-            bs_qlen--;
-            goto found;
-        }
-    }
-
-    LEAVE_QUEUE_CR;
-#ifdef BSDEBUG
-    queuedebug("queuesearch not found");
-#endif
-    return NULL;
-
-    found:
-    LEAVE_QUEUE_CR;
-#ifdef BSDEBUG
-    queuedebug("queuesearch found");
-#endif
-    return q;
-}
-
-/*****************************************************************************
- * Network communication                                                     *
- *****************************************************************************/
-
-/*
- * send_message: queue a request and transmit it to its server.
- *   @qe: request whose server, message.operation/flags/id, block and length
- *        have been filled in by the caller
- *
- * Builds the msghdr/iovec (header, plus the block when one is attached),
- * stamps a fresh luid and the calling thread's id, puts the entry on the
- * outstanding-request queue and sends it with a non-blocking sendmsg().
- *
- *   @return: the sendmsg() result; negative on failure.  On failure the
- *            entry is no longer on the queue, so the caller may free it.
- */
-int send_message(bsq_t *qe) {
-    int rc;
-
-    qe->msghdr.msg_name = (void *)&(bsservers[qe->server].sin);
-    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
-    qe->msghdr.msg_iov = qe->iov;
-    if (qe->block)
-        qe->msghdr.msg_iovlen = 2;
-    else
-        qe->msghdr.msg_iovlen = 1;
-    qe->msghdr.msg_control = NULL;
-    qe->msghdr.msg_controllen = 0;
-    qe->msghdr.msg_flags = 0;
-
-    qe->iov[0].iov_base = (void *)&(qe->message);
-    qe->iov[0].iov_len = MSGBUFSIZE_ID;
-
-    if (qe->block) {
-        qe->iov[1].iov_base = qe->block;
-        qe->iov[1].iov_len = BLOCK_SIZE;
-    }
-
-    qe->message.luid = new_luid();
-
-    qe->status = 0;
-    qe->tid = (int)pthread_getspecific(tid_key);
-
-    /* Timestamp before the entry becomes visible on the queue:
-     * queue_runner() reads tv_sent of every queued entry. */
-    gettimeofday(&(qe->tv_sent), NULL);
-    if (enqueue(qe) < 0) {
-        fprintf(stderr, "Error enqueuing request.\n");
-        return -1;
-    }
-
-    DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid);
-    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
-    if (rc < 0) {
-        /* Callers free qe when we fail; do not leave a dangling pointer
-         * on the queue for queue_runner()/queuesearch() to follow. */
-        dequeue(qe);
-    }
-
-    return rc;
-}
-
-/* Receive one datagram from bssock into qe: the header goes to qe->message
- * and, when qe->block is set, the payload goes to that buffer.  Returns the
- * recvmsg() result unchanged. */
-int recv_message(bsq_t *qe) {
-    struct sockaddr_in from;
-    struct msghdr *mh = &(qe->msghdr);
-
-    mh->msg_name = &from;
-    mh->msg_namelen = sizeof(struct sockaddr_in);
-    mh->msg_iov = qe->iov;
-    mh->msg_iovlen = qe->block ? 2 : 1;
-    mh->msg_control = NULL;
-    mh->msg_controllen = 0;
-    mh->msg_flags = 0;
-
-    qe->iov[0].iov_base = (void *)&(qe->message);
-    qe->iov[0].iov_len = MSGBUFSIZE_ID;
-    if (qe->block) {
-        qe->iov[1].iov_base = qe->block;
-        qe->iov[1].iov_len = BLOCK_SIZE;
-    }
-
-    return recvmsg(bssock, mh, 0);
-}
-
-/*
- * get_server_number: map a socket address to its index in bsservers[].
- *   @sin: address to look up
- *
- * Only slots with a hostname set are considered; family, port and address
- * must all match.
- *
- *   @return: index into bsservers[], or -1 if no server matches
- */
-int get_server_number(struct sockaddr_in *sin) {
-    int i;
-
-#ifdef BSDEBUG2
-    fprintf(stderr,
-            "get_server_number(%u.%u.%u.%u/%u)\n",
-            (unsigned int)sin->sin_addr.s_addr & 0xff,
-            ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff,
-            ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff,
-            ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff,
-            (unsigned int)sin->sin_port);
-#endif
-
-    for (i = 0; i < MAX_SERVERS; i++) {
-        if (bsservers[i].hostname) {
-#ifdef BSDEBUG2
-            fprintf(stderr,
-                    "get_server_number check %u.%u.%u.%u/%u\n",
-                    (unsigned int)bsservers[i].sin.sin_addr.s_addr&0xff,
-                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 8)&0xff,
-                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 16)&0xff,
-                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 24)&0xff,
-                    (unsigned int)bsservers[i].sin.sin_port);
-#endif
-            if ((sin->sin_family == bsservers[i].sin.sin_family) &&
-                (sin->sin_port == bsservers[i].sin.sin_port) &&
-                (memcmp((void *)&(sin->sin_addr),
-                        (void *)&(bsservers[i].sin.sin_addr),
-                        sizeof(struct in_addr)) == 0)) {
-                return i;
-            }
-        }        
-    }
-
-    return -1;
-}
-
-/* Spare receive buffer, reused across recv_any() calls via
- * recv_recycle_buffer(); NULL when none is available. */
-void *rx_buffer = NULL;
-/* Single shared receive entry filled in by recv_any(). */
-bsq_t rx_qe;
-
-/*
- * recv_any: receive the next datagram from any server.
- *
- * Uses the spare buffer if there is one, otherwise allocates a block-sized
- * buffer, receives the header plus payload into rx_qe, and records the
- * length and the sending server's index.
- *
- *   @return: &rx_qe on success, NULL on allocation or receive failure
- */
-bsq_t *recv_any(void) {
-    struct sockaddr_in from;
-    int rc;
-    
-    DB("ENTER recv_any\n");
-
-    rx_qe.msghdr.msg_name = &from;
-    rx_qe.msghdr.msg_namelen = sizeof(struct sockaddr_in);
-    rx_qe.msghdr.msg_iov = rx_qe.iov;
-    if (!rx_buffer) {
-        rx_buffer = malloc(BLOCK_SIZE);
-        if (!rx_buffer) {
-            perror("recv_any malloc");
-            return NULL;
-        }
-    }
-    rx_qe.block = rx_buffer;
-    rx_buffer = NULL;
-    rx_qe.msghdr.msg_iovlen = 2;
-    rx_qe.msghdr.msg_control = NULL;
-    rx_qe.msghdr.msg_controllen = 0;
-    rx_qe.msghdr.msg_flags = 0;
-    
-    rx_qe.iov[0].iov_base = (void *)&(rx_qe.message);
-    rx_qe.iov[0].iov_len = MSGBUFSIZE_ID;
-    rx_qe.iov[1].iov_base = rx_qe.block;
-    rx_qe.iov[1].iov_len = BLOCK_SIZE;
-
-    rc = recvmsg(bssock, &(rx_qe.msghdr), 0);
-    if (rc < 0) {
-        perror("recv_any");
-        /* Keep the buffer for the next call.  Without this the next call
-         * allocates a new one and overwrites rx_qe.block, leaking it. */
-        rx_buffer = rx_qe.block;
-        rx_qe.block = NULL;
-        return NULL;
-    }
-
-    rx_qe.length = rc;    
-    rx_qe.server = get_server_number(&from);
-
-    DB("recv_any from %d luid=%016llx len=%u\n",
-       rx_qe.server, rx_qe.message.luid, rx_qe.length);
-
-    return &rx_qe;
-}
-
-/* If q still holds a block buffer, move it to rx_buffer so the next
- * recv_any() call reuses it, and clear q->block. */
-void recv_recycle_buffer(bsq_t *q) {
-    if (q->block) {
-        rx_buffer = q->block;
-        q->block = NULL;
-    }
-}
-
-/*
- * wait_recv: block until every request in reqs[] has been matched.
- *   @reqs: requests previously sent with send_message()
- *   @numreqs: number of entries in reqs[]
- *
- * Replies are received and matched elsewhere (receive_loop()); this routine
- * only sleeps on the calling thread's notification and re-checks the
- * BSQ_STATUS_MATCHED bit of every request after each wakeup.
- *
- *   @return: numreqs once all requests are matched
- */
-int wait_recv(bsq_t **reqs, int numreqs) {
-    unsigned int x;
-    int i;
-    int tid = (int)pthread_getspecific(tid_key);
-
-    DB("ENTER wait_recv %u\n", numreqs);
-
-    for (;;) {
-        /* AND the status words together: the bit survives only if it is
-         * set in every request. */
-        x = 0xffffffff;
-        for (i = 0; i < numreqs; i++)
-            x &= reqs[i]->status;
-        if ((x & BSQ_STATUS_MATCHED)) {
-            DB("LEAVE wait_recv\n");
-            return numreqs;
-        }
-
-        RECV_AWAIT(tid);
-    }
-}
-
-/* Number of resends performed so far. */
-static int retry_count = 0;
-
-/* retry: resend a queued request using the msghdr prepared by
- * send_message(), refreshing its send timestamp and bumping retry_count.
- * Returns 0 on success or the negative sendmsg() result on failure.
- */
-int retry(bsq_t *qe)
-{
-    int sent;
-
-    gettimeofday(&(qe->tv_sent), NULL);
-    DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid);
-    retry_count += 1;
-    sent = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
-
-    return (sent < 0) ? sent : 0;
-}
-
-/* queue runner
- *
- * Thread body: once a second, walk the outstanding-request queue under the
- * queue lock and resend every request whose last send is older than
- * RETRY_TIMEOUT microseconds.  Reports how many retries were issued in the
- * pass.  Never returns.
- */
-void *queue_runner(void *arg)
-{
-    for (;;) {
-        struct timeval now;
-        long long nowus, sus;
-        bsq_t *q;
-        int r;
-
-        sleep(1);
-
-        gettimeofday(&now, NULL);
-        /* Widen before multiplying: tv_sec * 1000000 overflows a 32-bit
-         * time_t/long. */
-        nowus = (long long)now.tv_sec * 1000000LL + now.tv_usec;
-        ENTER_QUEUE_CR;
-        r = retry_count;
-        for (q = bs_head; q; q = q->next) {
-            sus = (long long)q->tv_sent.tv_sec * 1000000LL
-                  + q->tv_sent.tv_usec;
-            if ((nowus - sus) > RETRY_TIMEOUT) {
-                if (retry(q) < 0) {
-                    fprintf(stderr, "Error on sendmsg retry.\n");
-                }
-            }
-        }
-        if (r != retry_count) {
-            fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count);
-        }
-        LEAVE_QUEUE_CR;
-    }
-}
-
-/* receive loop
- *
- * Thread body: receive datagrams forever, match each one against the
- * outstanding requests with queuesearch(), recycle the receive buffer, and
- * wake the thread that sent the matched request.  Never returns.
- */
-void *receive_loop(void *arg)
-{
-    bsq_t *q, *m;
-
-    for(;;) {
-        q = recv_any();
-        if (!q) {
-            fprintf(stderr, "recv_any error\n");
-        }
-        else {
-            m = queuesearch(q);
-            recv_recycle_buffer(q);
-            if (!m) {
-                fprintf(stderr, "Unmatched RX\n");
-            }
-            else {
-                DB("RX MATCH");
-                /* NOTE(review): m is already marked matched here and its
-                 * owner frees it after wait_recv(); confirm the owner
-                 * cannot free m before m->tid is read. */
-                RECV_NOTIFY(m->tid);
-            }
-        }
-    }
-}
-pthread_t pthread_recv;
-
-/*****************************************************************************
- * Reading                                                                   *
- *****************************************************************************/
-
-/*
- * readblock_indiv: read one block from one server.
- *   @server: index into bsservers[]
- *   @id: block id on that server
- *
- * Sends a BSOP_READBLOCK request and waits for the matching reply; the
- * reply's block buffer is attached to the request by queuesearch().
- *
- *   @return: the block buffer (caller owns it), or NULL on error
- */
-void *readblock_indiv(int server, u64 id) {
-    void *block;
-    bsq_t *qe;
-    int rc;
-
-    qe = (bsq_t *)malloc(sizeof(bsq_t));
-    if (!qe) {
-        perror("readblock qe malloc");
-        return NULL;
-    }
-    qe->block = NULL;
-
-    qe->server = server;
-
-    qe->message.operation = BSOP_READBLOCK;
-    qe->message.flags = 0;
-    qe->message.id = id;
-    qe->length = MSGBUFSIZE_ID;
-
-    if (send_message(qe) < 0) {
-        perror("readblock sendto");
-        goto err;
-    }
-
-    rc = wait_recv(&qe, 1);
-    if (rc < 0) {
-        perror("readblock recv");
-        goto err;
-    }
-
-    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "readblock server error\n");
-        goto err;
-    }
-    if (qe->length < MSGBUFSIZE_BLOCK) {
-        /* Report the received length; this used to print an
-         * uninitialised local. */
-        fprintf(stderr, "readblock recv short (%u)\n", qe->length);
-        goto err;
-    }
-    block = qe->block;
-
-    free((void *)qe);
-    return block;
-
-    err:
-    /* The entry may still be on the outstanding queue (e.g. the send
-     * failed); unlink it before freeing.  dequeue() is a no-op if the
-     * entry has already been removed. */
-    dequeue(qe);
-    if (qe->block)
-        free(qe->block);
-    free((void *)qe);
-    return NULL;
-}
-
-/**
- * readblock: read a block from disk
- *   @id: block id to read
- *
- * Ids below 6 are always read from the first replica; other reads rotate
- * through the replicas, one step per call.
- *
- *   @return: pointer to block, NULL on error
- */
-void *readblock(u64 id) {
-    int map = (int)BSID_MAP(id);
-    u64 xid;
-    /* Replica used by the previous call; advanced before each read.
-     * NOTE(review): shared by all callers with no locking -- confirm this
-     * is acceptable when several threads read concurrently. */
-    static int i = CLUSTER_MAX_REPLICAS - 1;
-    void *block = NULL;
-
-    /* special case for the "superblock" just use the first block on the
-     * first replica. (extend to blocks < 6 for vdi bug)
-     */
-    if (id < 6) {
-        block = readblock_indiv(bsclusters[map].servers[0], id);
-        goto out;
-    }
-
-    i++;
-    if (i >= CLUSTER_MAX_REPLICAS)
-        i = 0;
-    /* NOTE(review): xid is left unset if i is ever outside 0..2; this
-     * assumes CLUSTER_MAX_REPLICAS <= 3 -- confirm. */
-    switch (i) {
-    case 0:
-        xid = BSID_REPLICA0(id);
-        break;
-    case 1:
-        xid = BSID_REPLICA1(id);
-        break;
-    case 2:
-        xid = BSID_REPLICA2(id);
-        break;
-    }
-    
-    block = readblock_indiv(bsclusters[map].servers[i], xid);
-
-    out:
-#ifdef BSDEBUG
-    if (block)
-        fprintf(stderr, "READ:  %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
-                id,
-                (unsigned int)((unsigned char *)block)[0],
-                (unsigned int)((unsigned char *)block)[1],
-                (unsigned int)((unsigned char *)block)[2],
-                (unsigned int)((unsigned char *)block)[3],
-                (unsigned int)((unsigned char *)block)[4],
-                (unsigned int)((unsigned char *)block)[5],
-                (unsigned int)((unsigned char *)block)[6],
-                (unsigned int)((unsigned char *)block)[7]);
-    else
-        fprintf(stderr, "READ:  %016llx NULL\n", id);
-#endif
-    return block;
-}
-
-/*****************************************************************************
- * Writing                                                                   *
- *****************************************************************************/
-
-/*
- * writeblock_indiv: send one BSOP_WRITEBLOCK request to one server.
- *   @server: index into bsservers[]
- *   @id: block id on that server
- *   @block: block data; referenced, not copied, so it must stay valid until
- *           the request completes
- *
- *   @return: the queued request (caller waits on it and frees it), or NULL
- *            on allocation or send failure
- */
-bsq_t *writeblock_indiv(int server, u64 id, void *block) {
-
-    bsq_t *qe;
-
-    qe = (bsq_t *)malloc(sizeof(bsq_t));
-    if (!qe) {
-        perror("writeblock qe malloc");
-        return NULL;
-    }
-    qe->server = server;
-
-    qe->message.operation = BSOP_WRITEBLOCK;
-    qe->message.flags = 0;
-    qe->message.id = id;
-    qe->block = block;
-    qe->length = MSGBUFSIZE_BLOCK;
-
-    if (send_message(qe) < 0) {
-        perror("writeblock sendto");
-        /* send_message() queues the entry before sending; make sure it is
-         * off the queue before the memory is released. */
-        dequeue(qe);
-        free((void *)qe);
-        return NULL;
-    }
-
-    return qe;
-}
-    
-
-/**
- * writeblock: write an existing block to disk
- *   @id: block id
- *   @block: pointer to block
- *
- * Ids below 6 are written to the first replica only; every other block is
- * written to all three replicas and the call succeeds only if all three
- * acknowledge without error.
- *
- *   @return: zero on success, -1 on failure
- */
-int writeblock(u64 id, void *block) {
-    
-    int map = (int)BSID_MAP(id);
-    int rep0 = bsclusters[map].servers[0];
-    int rep1 = bsclusters[map].servers[1];
-    int rep2 = bsclusters[map].servers[2];
-    bsq_t *reqs[3];
-    int rc;
-
-    reqs[0] = reqs[1] = reqs[2] = NULL;
-
-#ifdef BSDEBUG
-    fprintf(stderr,
-            "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
-            id,
-            (unsigned int)((unsigned char *)block)[0],
-            (unsigned int)((unsigned char *)block)[1],
-            (unsigned int)((unsigned char *)block)[2],
-            (unsigned int)((unsigned char *)block)[3],
-            (unsigned int)((unsigned char *)block)[4],
-            (unsigned int)((unsigned char *)block)[5],
-            (unsigned int)((unsigned char *)block)[6],
-            (unsigned int)((unsigned char *)block)[7]);
-#endif
-
-    /* special case for the "superblock" just use the first block on the
-     * first replica. (extend to blocks < 6 for vdi bug)
-     */
-    if (id < 6) {
-        reqs[0] = writeblock_indiv(rep0, id, block);
-        if (!reqs[0])
-            return -1;
-        rc = wait_recv(reqs, 1);
-        if (rc < 0) {
-            perror("writeblock recv");
-            goto err;
-        }
-        /* This path used to return wait_recv()'s count (1), never freed
-         * the request and ignored a server-side error. */
-        if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
-            fprintf(stderr, "writeblock server0 error\n");
-            goto err;
-        }
-        free((void *)reqs[0]);
-        return 0;
-    }
-
-    reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block);
-    if (!reqs[0])
-        goto err;
-    reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block);
-    if (!reqs[1])
-        goto err;
-    reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block);
-    if (!reqs[2])
-        goto err;
-
-    rc = wait_recv(reqs, 3);
-    if (rc < 0) {
-        perror("writeblock recv");
-        goto err;
-    }
-    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "writeblock server0 error\n");
-        goto err;
-    }
-    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "writeblock server1 error\n");
-        goto err;
-    }
-    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "writeblock server2 error\n");
-        goto err;
-    }
-
-
-    free((void *)reqs[0]);
-    free((void *)reqs[1]);
-    free((void *)reqs[2]);
-    return 0;
-
-    err:
-    /* Unlink anything still outstanding before freeing it. */
-    if (reqs[0]) {
-        dequeue(reqs[0]);
-        free((void *)reqs[0]);
-    }
-    if (reqs[1]) {
-        dequeue(reqs[1]);
-        free((void *)reqs[1]);
-    }
-    if (reqs[2]) {
-        dequeue(reqs[2]);
-        free((void *)reqs[2]);
-    }
-    return -1;
-}
-
-/*****************************************************************************
- * Allocation                                                                *
- *****************************************************************************/
-
-/**
- * allocblock: write a new block to disk
- *   @block: pointer to block
- *
- * Equivalent to allocblock_hint() with a hint of 0.
- *
- *   @return: new id of block on disk
- */
-u64 allocblock(void *block) {
-    return allocblock_hint(block, 0);
-}
-
-/*
- * allocblock_hint_indiv: send one BSOP_ALLOCBLOCK request to one server.
- *   @server: index into bsservers[]
- *   @block: block data; referenced, not copied, so it must stay valid until
- *           the request completes
- *   @hint: allocation hint, sent in the message id field
- *
- *   @return: the queued request (caller waits on it and frees it), or NULL
- *            on allocation or send failure
- */
-bsq_t *allocblock_hint_indiv(int server, void *block, u64 hint) {
-    bsq_t *qe;
-
-    qe = (bsq_t *)malloc(sizeof(bsq_t));
-    if (!qe) {
-        perror("allocblock_hint qe malloc");
-        return NULL;
-    }
-    qe->server = server;
-
-    qe->message.operation = BSOP_ALLOCBLOCK;
-    qe->message.flags = 0;
-    qe->message.id = hint;
-    qe->block = block;
-    qe->length = MSGBUFSIZE_BLOCK;
-
-    if (send_message(qe) < 0) {
-        perror("allocblock_hint sendto");
-        /* send_message() queues the entry before sending; make sure it is
-         * off the queue before the memory is released. */
-        dequeue(qe);
-        free((void *)qe);
-        return NULL;
-    }
-    
-    return qe;
-}
-
-/**
- * allocblock_hint: write a new block to disk
- *   @block: pointer to block
- *   @hint: allocation hint
- *
- * Sends the block to all three replicas of the cluster selected by the hint
- * and combines the three ids they return into one block id.
- *
- *   @return: new id of block on disk, or 0 on failure
- */
-u64 allocblock_hint(void *block, u64 hint) {
-    /* The hint is used directly as the cluster index.
-     * NOTE(review): no range check against the size of bsclusters[] --
-     * confirm callers only pass valid cluster numbers. */
-    int map = (int)hint;
-    int rep0 = bsclusters[map].servers[0];
-    int rep1 = bsclusters[map].servers[1];
-    int rep2 = bsclusters[map].servers[2];
-    bsq_t *reqs[3];
-    int rc;
-    u64 id0, id1, id2;
-
-    reqs[0] = reqs[1] = reqs[2] = NULL;
-
-    DB("ENTER allocblock\n");
-
-    reqs[0] = allocblock_hint_indiv(rep0, block, hint);
-    if (!reqs[0])
-        goto err;
-    reqs[1] = allocblock_hint_indiv(rep1, block, hint);
-    if (!reqs[1])
-        goto err;
-    reqs[2] = allocblock_hint_indiv(rep2, block, hint);
-    if (!reqs[2])
-        goto err;
-
-    rc = wait_recv(reqs, 3);
-    if (rc < 0) {
-        perror("allocblock recv");
-        goto err;
-    }
-    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "allocblock server0 error\n");
-        goto err;
-    }
-    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "allocblock server1 error\n");
-        goto err;
-    }
-    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "allocblock server2 error\n");
-        goto err;
-    }
-
-    /* Each reply carries the id that replica assigned to the block. */
-    id0 = reqs[0]->message.id;
-    id1 = reqs[1]->message.id;
-    id2 = reqs[2]->message.id;
-
-#ifdef BSDEBUG
-    fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
-            BSID(map, id0, id1, id2),
-            (unsigned int)((unsigned char *)block)[0],
-            (unsigned int)((unsigned char *)block)[1],
-            (unsigned int)((unsigned char *)block)[2],
-            (unsigned int)((unsigned char *)block)[3],
-            (unsigned int)((unsigned char *)block)[4],
-            (unsigned int)((unsigned char *)block)[5],
-            (unsigned int)((unsigned char *)block)[6],
-            (unsigned int)((unsigned char *)block)[7]);
-#endif
-    
-    free((void *)reqs[0]);
-    free((void *)reqs[1]);
-    free((void *)reqs[2]);
-    return BSID(map, id0, id1, id2);
-
-    err:
-    /* Unlink anything still outstanding before freeing it. */
-    if (reqs[0]) {
-        dequeue(reqs[0]);
-        free((void *)reqs[0]);
-    }
-    if (reqs[1]) {
-        dequeue(reqs[1]);
-        free((void *)reqs[1]);
-    }
-    if (reqs[2]) {
-        dequeue(reqs[2]);
-        free((void *)reqs[2]);
-    }
-    return 0;
-}
-
-#else /* /BLOCKSTORE_REMOTE */
-
-/*****************************************************************************
- * Local storage version                                                     *
- *****************************************************************************/
- 
-/**
- * readblock: read a block from disk
- *   @id: block id to read
- *
- * Block ids are 1-based: block id lives at byte offset (id - 1) * BLOCK_SIZE
- * of "blockstore.dat" in the current directory.
- *
- *   @return: pointer to a freshly allocated block, NULL on error
- */
-
-void *readblock(u64 id) {
-    void *block = NULL;
-    int block_fp;
-
-    block_fp = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644);
-    if (block_fp < 0) {
-        perror("open");
-        return NULL;
-    }
-
-    /* Each step runs only if the previous one succeeded; whatever happens,
-     * the descriptor is closed exactly once below. */
-    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
-        printf ("%Ld ", id);
-        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
-        perror("readblock lseek");
-    } else if ((block = malloc(BLOCK_SIZE)) == NULL) {
-        perror("readblock malloc");
-    } else if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
-        perror("readblock read");
-        free(block);
-        block = NULL;
-    }
-
-    close(block_fp);
-    return block;
-}
-
-/**
- * writeblock: write an existing block to disk
- *   @id: block id
- *   @block: pointer to block
- *
- * Block ids are 1-based: the block is written at byte offset
- * (id - 1) * BLOCK_SIZE of "blockstore.dat" in the current directory.
- *
- *   @return: zero on success, -1 on failure
- */
-int writeblock(u64 id, void *block) {
-    
-    int block_fp;
-    
-    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
-
-    if (block_fp < 0) {
-        perror("open");
-        return -1;
-    }
-
-    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
-        perror("writeblock lseek");
-        goto err;
-    }
-    /* A short write leaves a partial block on disk, so treat anything
-     * other than a full block as failure (as allocblock() does). */
-    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
-        perror("writeblock write");
-        goto err;
-    }
-    close(block_fp);
-    return 0;
-
-err:
-    close(block_fp);
-    return -1;
-}
-
-/**
- * allocblock: write a new block to disk
- *   @block: pointer to block
- *
- * Appends the block to "blockstore.dat"; the new id is the 1-based index of
- * the block within the file.
- *
- *   @return: new id of block on disk, 0 on failure
- */
-
-u64 allocblock(void *block) {
-    u64 lb = 0;
-    off64_t pos;
-    int block_fp;
-
-    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
-    if (block_fp < 0) {
-        perror("open");
-        return 0;
-    }
-
-    /* lb stays 0 unless every step succeeds. */
-    pos = lseek64(block_fp, 0, SEEK_END);
-    if (pos == (off64_t)-1)
-        perror("allocblock lseek");
-    else if (pos % BLOCK_SIZE != 0)
-        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
-    else if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE)
-        perror("allocblock write");
-    else
-        lb = pos / BLOCK_SIZE + 1;
-
-    close(block_fp);
-    return lb;
-}
-
-/**
- * allocblock_hint: write a new block to disk
- *   @block: pointer to block
- *   @hint: allocation hint (ignored by the local store)
- *
- *   @return: new id of block on disk
- */
-u64 allocblock_hint(void *block, u64 hint) {
-    return allocblock(block);
-}
-
-#endif /* BLOCKSTORE_REMOTE */
-
-/*****************************************************************************
- * Memory management                                                         *
- *****************************************************************************/
-
-/**
- * newblock: get a new in-memory block set to zeros
- *
- *   @return: pointer to new block, NULL on error
- */
-void *newblock() {
-    /* calloc hands back BLOCK_SIZE bytes that are already zeroed. */
-    void *block = calloc(1, BLOCK_SIZE);
-
-    if (block == NULL)
-        perror("newblock");
-    return block;
-}
-
-
-/**
- * freeblock: unallocate an in-memory block
- *   @block: block to be freed; NULL is accepted and ignored
- */
-void freeblock(void *block) {
-    if (block != NULL)
-        free(block);
-}
-
-/* Allocate an empty in-memory free-list block: magic set, no successor,
- * zero entries, list cleared.  Returns NULL if the allocation fails. */
-static freeblock_t *new_freeblock(void)
-{
-    freeblock_t *fb = newblock();
-
-    if (fb != NULL) {
-        fb->magic = FREEBLOCK_MAGIC;
-        fb->next  = 0ULL;
-        fb->count = 0ULL;
-        memset(fb->list, 0, sizeof fb->list);
-    }
-
-    return fb;
-}
-
-/*
- * releaseblock: put a block id on the on-disk free list.
- *   @id: block id being released
- *
- * The superblock points at a "current" free-list block and a chain of full
- * ones.  The id is appended to the current block; when that block is full
- * it is pushed onto the full chain and a new current block is allocated.
- * If a block cannot be read or allocated the release is abandoned with a
- * message on stderr.
- */
-void releaseblock(u64 id)
-{
-    blockstore_super_t *bs_super;
-    freeblock_t *fl_current = NULL;
-    
-    /* get superblock */
-    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
-    if (bs_super == NULL) {
-        fprintf(stderr, "releaseblock: cannot read superblock\n");
-        return;
-    }
-    
-    /* get freeblock_current */
-    if (bs_super->freelist_current == 0ULL) 
-    {
-        fl_current = new_freeblock();
-        if (fl_current == NULL)
-            goto fail;
-        bs_super->freelist_current = allocblock(fl_current);
-        writeblock(BLOCKSTORE_SUPER, bs_super);
-    } else {
-        fl_current = readblock(bs_super->freelist_current);
-        if (fl_current == NULL)
-            goto fail;
-    }
-    
-    /* if full, chain to superblock and allocate new current */
-    
-    if (fl_current->count == FREEBLOCK_SIZE) {
-        fl_current->next = bs_super->freelist_full;
-        writeblock(bs_super->freelist_current, fl_current);
-        bs_super->freelist_full = bs_super->freelist_current;
-        freeblock(fl_current);
-        fl_current = new_freeblock();
-        if (fl_current == NULL)
-            goto fail;
-        bs_super->freelist_current = allocblock(fl_current);
-        writeblock(BLOCKSTORE_SUPER, bs_super);
-    }
-    
-    /* append id to current */
-    fl_current->list[fl_current->count++] = id;
-    writeblock(bs_super->freelist_current, fl_current);
-    
-    freeblock(fl_current);
-    freeblock(bs_super);
-    return;
-
-fail:
-    /* These pointers used to be dereferenced without a check. */
-    fprintf(stderr, "releaseblock: cannot get free-list block\n");
-    freeblock(fl_current);
-    freeblock(bs_super);
-}
-
-/* freelist debug functions: */
-void freelist_count(int print_each)
-{
-    blockstore_super_t *bs_super;
-    freeblock_t *fb;
-    u64 total = 0, next;
-    
-    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
-    
-    if (bs_super->freelist_current == 0ULL) {
-        printf("freelist is empty!\n");
-        return;
-    }
-    
-    fb = readblock(bs_super->freelist_current);
-    printf("%Ld entires on current.\n", fb->count);
-    total += fb->count;
-    if (print_each == 1)
-    {
-        int i;
-        for (i=0; i< fb->count; i++)
-            printf("  %Ld\n", fb->list[i]);
-    }
-    
-    freeblock(fb);
-    
-    if (bs_super->freelist_full == 0ULL) {
-        printf("freelist_full is empty!\n");
-        return;
-    }
-    
-    next = bs_super->freelist_full;
-    for (;;) {
-        fb = readblock(next);
-        total += fb->count;
-        if (print_each == 1)
-        {
-            int i;
-            for (i=0; i< fb->count; i++)
-                printf("  %Ld\n", fb->list[i]);
-        }
-        next = fb->next;
-        freeblock(fb);
-        if (next == 0ULL) break;
-    }
-    printf("Total of %Ld ids on freelist.\n", total);
-}
-
-/*****************************************************************************
- * Initialisation                                                            *
- *****************************************************************************/
-
-int __init_blockstore(void)
-{
-    int i;
-    blockstore_super_t *bs_super;
-    u64 ret;
-    int block_fp;
-    
-#ifdef BLOCKSTORE_REMOTE
-    struct hostent *addr;
-
-    pthread_mutex_init(&ptmutex_queue, NULL);
-    pthread_mutex_init(&ptmutex_luid, NULL);
-    pthread_mutex_init(&ptmutex_recv, NULL);
-    /*pthread_mutex_init(&ptmutex_notify, NULL);*/
-    for (i = 0; i <= READ_POOL_SIZE; i++) {
-        pool_thread[i].newdata = 0;
-        pthread_mutex_init(&(pool_thread[i].ptmutex), NULL);
-        pthread_cond_init(&(pool_thread[i].ptcv), NULL);
-    }
-
-    bsservers[0].hostname = "firebug.cl.cam.ac.uk";
-    bsservers[1].hostname = "planb.cl.cam.ac.uk";
-    bsservers[2].hostname = "simcity.cl.cam.ac.uk";
-    bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/;
-    bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/;
-    bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/;
-    bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/;
-    bsservers[7].hostname = NULL/*"felix.cl.cam.ac.uk"*/;
-    bsservers[8].hostname = NULL;
-    bsservers[9].hostname = NULL;
-    bsservers[10].hostname = NULL;
-    bsservers[11].hostname = NULL;
-    bsservers[12].hostname = NULL;
-    bsservers[13].hostname = NULL;
-    bsservers[14].hostname = NULL;
-    bsservers[15].hostname = NULL;
-
-    for (i = 0; i < MAX_SERVERS; i++) {
-        if (!bsservers[i].hostname)
-            continue;
-        addr = gethostbyname(bsservers[i].hostname);
-        if (!addr) {
-            perror("bad hostname");
-            return -1;
-        }
-        bsservers[i].sin.sin_family = addr->h_addrtype;
-        bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT);
-        bsservers[i].sin.sin_addr.s_addr = 
-            ((struct in_addr *)(addr->h_addr))->s_addr;
-    }
-
-    /* Cluster map
-     */
-    bsclusters[0].servers[0] = 0;
-    bsclusters[0].servers[1] = 1;
-    bsclusters[0].servers[2] = 2;
-    bsclusters[1].servers[0] = 1;
-    bsclusters[1].servers[1] = 2;
-    bsclusters[1].servers[2] = 3;
-    bsclusters[2].servers[0] = 2;
-    bsclusters[2].servers[1] = 3;
-    bsclusters[2].servers[2] = 4;
-    bsclusters[3].servers[0] = 3;
-    bsclusters[3].servers[1] = 4;
-    bsclusters[3].servers[2] = 5;
-    bsclusters[4].servers[0] = 4;
-    bsclusters[4].servers[1] = 5;
-    bsclusters[4].servers[2] = 6;
-    bsclusters[5].servers[0] = 5;
-    bsclusters[5].servers[1] = 6;
-    bsclusters[5].servers[2] = 7;
-    bsclusters[6].servers[0] = 6;
-    bsclusters[6].servers[1] = 7;
-    bsclusters[6].servers[2] = 0;
-    bsclusters[7].servers[0] = 7;
-    bsclusters[7].servers[1] = 0;
-    bsclusters[7].servers[2] = 1;
-
-    /* Local socket set up
-     */
-    bssock = socket(AF_INET, SOCK_DGRAM, 0);
-    if (bssock < 0) {
-        perror("Bad socket");
-        return -1;
-    }
-    memset(&sin_local, 0, sizeof(sin_local));
-    sin_local.sin_family = AF_INET;
-    sin_local.sin_port = htons(BLOCKSTORED_PORT);
-    sin_local.sin_addr.s_addr = htonl(INADDR_ANY);
-    if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) {
-        perror("bind");
-        close(bssock);
-        return -1;
-    }
-
-    pthread_create(&pthread_recv, NULL, receive_loop, NULL);
-    pthread_create(&pthread_recv, NULL, queue_runner, NULL);
-
-#else /* /BLOCKSTORE_REMOTE */
-    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
-
-    if (block_fp < 0) {
-        perror("open");
-        return -1;
-        exit(-1);
-    }
-    
-    if (lseek(block_fp, 0, SEEK_END) == 0) {
-        bs_super = newblock();
-        bs_super->magic            = BLOCKSTORE_MAGIC;
-        bs_super->freelist_full    = 0LL;
-        bs_super->freelist_current = 0LL;
-        
-        ret = allocblock(bs_super);
-        
-        freeblock(bs_super);
-    } else {
-        bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
-        if (bs_super->magic != BLOCKSTORE_MAGIC)
-        {
-            printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n");
-            exit(-1);
-        }
-        freeblock(bs_super);
-    }
-        
-    close(block_fp);
-        
-#endif /*  BLOCKSTORE_REMOTE */   
-    return 0;
-}
-
-void __exit_blockstore(void)
-{
-    int i;
-#ifdef BLOCKSTORE_REMOTE
-    pthread_mutex_destroy(&ptmutex_recv);
-    pthread_mutex_destroy(&ptmutex_luid);
-    pthread_mutex_destroy(&ptmutex_queue);
-    /*pthread_mutex_destroy(&ptmutex_notify);
-      pthread_cond_destroy(&ptcv_notify);*/
-    for (i = 0; i <= READ_POOL_SIZE; i++) {
-        pthread_mutex_destroy(&(pool_thread[i].ptmutex));
-        pthread_cond_destroy(&(pool_thread[i].ptcv));
-    }
-#endif
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/blockstore.h
--- a/tools/blktap/blockstore.h Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,134 +0,0 @@
-/**************************************************************************
- * 
- * blockstore.h
- *
- * Simple block store interface
- *
- */
- 
-#ifndef __BLOCKSTORE_H__
-#define __BLOCKSTORE_H__
-
-#include <netinet/in.h>
-#include <xc.h>
-
-#define BLOCK_SIZE  4096
-#define BLOCK_SHIFT   12
-#define BLOCK_MASK  0xfffffffffffff000LL
-
-/* XXX SMH: where is the below supposed to be defined???? */
-#ifndef SECTOR_SHIFT 
-#define SECTOR_SHIFT   9 
-#endif
-
-#define FREEBLOCK_SIZE  (BLOCK_SIZE / sizeof(u64)) - (3 * sizeof(u64))
-#define FREEBLOCK_MAGIC 0x0fee0fee0fee0feeULL
-
-typedef struct {
-    u64 magic;
-    u64 next;
-    u64 count;
-    u64 list[FREEBLOCK_SIZE];
-} freeblock_t; 
-
-#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaaULL
-#define BLOCKSTORE_SUPER 1ULL
-
-typedef struct {
-    u64 magic;
-    u64 freelist_full;
-    u64 freelist_current;
-} blockstore_super_t;
-
-extern void *newblock();
-extern void *readblock(u64 id);
-extern u64 allocblock(void *block);
-extern u64 allocblock_hint(void *block, u64 hint);
-extern int writeblock(u64 id, void *block);
-
-/* Add this blockid to a freelist, to be recycled by the allocator. */
-extern void releaseblock(u64 id);
-
-/* this is a memory free() operation for block-sized allocations */
-extern void freeblock(void *block);
-extern int __init_blockstore(void);
-
-/* debug for freelist. */
-void freelist_count(int print_each);
-#define ALLOCFAIL (((u64)(-1)))
-
-/* Distribution
- */
-#define BLOCKSTORED_PORT 9346
-
-struct bshdr_t_struct {
-    u32            operation;
-    u32            flags;
-    u64            id;
-    u64            luid;
-} __attribute__ ((packed));
-typedef struct bshdr_t_struct bshdr_t;
-
-struct bsmsg_t_struct {
-    bshdr_t        hdr;
-    unsigned char  block[BLOCK_SIZE];
-} __attribute__ ((packed));
-
-typedef struct bsmsg_t_struct bsmsg_t;
-
-#define MSGBUFSIZE_OP    sizeof(u32)
-#define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32))
-#define MSGBUFSIZE_ID    (sizeof(u32) + sizeof(u32) + sizeof(u64) + sizeof(u64))
-#define MSGBUFSIZE_BLOCK sizeof(bsmsg_t)
-
-#define BSOP_READBLOCK  0x01
-#define BSOP_WRITEBLOCK 0x02
-#define BSOP_ALLOCBLOCK 0x03
-#define BSOP_FREEBLOCK  0x04
-
-#define BSOP_FLAG_ERROR 0x01
-
-#define BS_ALLOC_SKIP 10
-#define BS_ALLOC_HACK
-
-/* Remote hosts and cluster map - XXX need to generalise
- */
-
-/*
-
-  Interim ID format is
-
-  63 60 59                40 39                20 19                 0
-  +----+--------------------+--------------------+--------------------+
-  |map | replica 2          | replica 1          | replica 0          |
-  +----+--------------------+--------------------+--------------------+
-
-  The map is an index into a table detailing which machines form the
-  cluster.
-
- */
-
-#define BSID_REPLICA0(_id) ((_id)&0xfffffULL)
-#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL)
-#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL)
-#define BSID_MAP(_id)      (((_id)>>60)&0xfULL)
-
-#define BSID(_map, _rep0, _rep1, _rep2) ((((u64)(_map))<<60) | \
-                                         (((u64)(_rep2))<<40) | \
-                                         (((u64)(_rep1))<<20) | ((u64)(_rep0)))
-
-typedef struct bsserver_t_struct {
-    char              *hostname;
-    struct sockaddr_in sin;
-} bsserver_t;
-
-#define MAX_SERVERS 16
-
-#define CLUSTER_MAX_REPLICAS 3
-typedef struct bscluster_t_struct {
-    int servers[CLUSTER_MAX_REPLICAS];
-} bscluster_t;
-
-#define MAX_CLUSTERS 16
-
-#endif /* __BLOCKSTORE_H__ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/blockstored.c
--- a/tools/blktap/blockstored.c        Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,276 +0,0 @@
-/**************************************************************************
- * 
- * blockstored.c
- *
- * Block store daemon.
- *
- */
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <netinet/in.h>
-#include <errno.h>
-#include "blockstore.h"
-
-//#define BSDEBUG
-
-int readblock_into(u64 id, void *block);
-
-int open_socket(u16 port) {
-    
-    struct sockaddr_in sn;
-    int sock;
-
-    sock = socket(AF_INET, SOCK_DGRAM, 0);
-    if (sock < 0) {
-        perror("Bad socket");
-        return -1;
-    }
-    memset(&sn, 0, sizeof(sn));
-    sn.sin_family = AF_INET;
-    sn.sin_port = htons(port);
-    sn.sin_addr.s_addr = htonl(INADDR_ANY);
-    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
-        perror("bind");
-        close(sock);
-        return -1;
-    }
-
-    return sock;
-}
-
-static int block_fp = -1;
-static int bssock = -1;
-
-int send_reply(struct sockaddr_in *peer, void *buffer, int len) {
-
-    int rc;
-    
-#ifdef BSDEBUG
-    fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n",
-            len, ((bsmsg_t *)buffer)->hdr.operation, ((bsmsg_t *)buffer)->hdr.id);
-#endif
-    rc = sendto(bssock, buffer, len, 0, (struct sockaddr *)peer, sizeof(*peer));
-    if (rc < 0) {
-        perror("send_reply");
-        return 1;
-    }
-
-
-    return 0;
-}
-
-static bsmsg_t msgbuf;
-
-void service_loop(void) {
-
-    for (;;) {
-        int rc, len;
-        struct sockaddr_in from;
-        size_t slen = sizeof(from);
-        u64 bid;
-
-        len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0,
-                       (struct sockaddr *)&from, &slen);
-
-        if (len < 0) {
-            perror("recvfrom");
-            continue;
-        }
-
-        if (len < MSGBUFSIZE_OP) {
-            fprintf(stderr, "Short packet.\n");
-            continue;
-        }
-
-#ifdef BSDEBUG
-        fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n",
-                len, msgbuf.hdr.operation, msgbuf.hdr.id);
-#endif
-
-        switch (msgbuf.hdr.operation) {
-        case BSOP_READBLOCK:
-            if (len < MSGBUFSIZE_ID) {
-                fprintf(stderr, "Short packet (readblock %u).\n", len);
-                continue;
-            }
-            rc = readblock_into(msgbuf.hdr.id, msgbuf.block);
-            if (rc < 0) {
-                fprintf(stderr, "readblock error\n");
-                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
-                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
-                continue;
-            }
-            msgbuf.hdr.flags = 0;
-            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK);
-            break;
-        case BSOP_WRITEBLOCK:
-            if (len < MSGBUFSIZE_BLOCK) {
-                fprintf(stderr, "Short packet (writeblock %u).\n", len);
-                continue;
-            }
-            rc = writeblock(msgbuf.hdr.id, msgbuf.block);
-            if (rc < 0) {
-                fprintf(stderr, "writeblock error\n");
-                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
-                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
-                continue;
-            }
-            msgbuf.hdr.flags = 0;
-            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
-            break;
-        case BSOP_ALLOCBLOCK:
-            if (len < MSGBUFSIZE_BLOCK) {
-                fprintf(stderr, "Short packet (allocblock %u).\n", len);
-                continue;
-            }
-            bid = allocblock(msgbuf.block);
-            if (bid == ALLOCFAIL) {
-                fprintf(stderr, "allocblock error\n");
-                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
-                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
-                continue;
-            }
-            msgbuf.hdr.id = bid;
-            msgbuf.hdr.flags = 0;
-            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
-            break;
-        }
-
-    }
-}
- 
-/**
- * readblock: read a block from disk
- *   @id: block id to read
- *   @block: pointer to buffer to receive block
- *
- *   @return: 0 if OK, other on error
- */
-
-int readblock_into(u64 id, void *block) {
-    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
-        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
-        perror("readblock lseek");
-        return -1;
-    }
-    if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
-        perror("readblock read");
-        return -1;
-    }
-    return 0;
-}
-
-/**
- * writeblock: write an existing block to disk
- *   @id: block id
- *   @block: pointer to block
- *
- *   @return: zero on success, -1 on failure
- */
-int writeblock(u64 id, void *block) {
-    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
-        perror("writeblock lseek");
-        return -1;
-    }
-    if (write(block_fp, block, BLOCK_SIZE) < 0) {
-        perror("writeblock write");
-        return -1;
-    }
-    return 0;
-}
-
-/**
- * allocblock: write a new block to disk
- *   @block: pointer to block
- *
- *   @return: new id of block on disk
- */
-static u64 lastblock = 0;
-
-u64 allocblock(void *block) {
-    u64 lb;
-    off64_t pos;
-
-    retry:
-    pos = lseek64(block_fp, 0, SEEK_END);
-    if (pos == (off64_t)-1) {
-        perror("allocblock lseek");
-        return ALLOCFAIL;
-    }
-    if (pos % BLOCK_SIZE != 0) {
-        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
-        return ALLOCFAIL;
-    }
-    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
-        perror("allocblock write");
-        return ALLOCFAIL;
-    }
-    lb = pos / BLOCK_SIZE + 1;
-
-#ifdef BS_ALLOC_HACK
-    if (lb < BS_ALLOC_SKIP)
-        goto retry;
-#endif
-    
-    if (lb <= lastblock)
-        printf("[*** %Ld alredy allocated! ***]\n", lb);
-    
-    lastblock = lb;
-    return lb;
-}
-
-/**
- * newblock: get a new in-memory block set to zeros
- *
- *   @return: pointer to new block, NULL on error
- */
-void *newblock() {
-    void *block = malloc(BLOCK_SIZE);
-    if (block == NULL) {
-        perror("newblock");
-        return NULL;
-    }
-    memset(block, 0, BLOCK_SIZE);
-    return block;
-}
-
-
-/**
- * freeblock: unallocate an in-memory block
- *   @id: block id (zero if this is only in-memory)
- *   @block: block to be freed
- */
-void freeblock(void *block) {
-    if (block != NULL)
-        free(block);
-}
-
-
-int main(int argc, char **argv)
-{
-    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
-
-    if (block_fp < 0) {
-        perror("open");
-        return -1;
-    }
-
-    bssock = open_socket(BLOCKSTORED_PORT);
-    if (bssock < 0) {
-        return -1;
-    }
-
-    service_loop();
-    
-    close(bssock);
-
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/bstest.c
--- a/tools/blktap/bstest.c     Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,191 +0,0 @@
-/**************************************************************************
- * 
- * bstest.c
- *
- * Block store daemon test program.
- *
- * usage: bstest <host>|X {r|w|a} ID 
- *
- */
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <netinet/in.h>
-#include <netdb.h>
-#include <errno.h>
-#include "blockstore.h"
-
-int direct(char *host, u32 op, u64 id, int len) {
-    struct sockaddr_in sn, peer;
-    int sock;
-    bsmsg_t msgbuf;
-    int rc, slen;
-    struct hostent *addr;
-
-    addr = gethostbyname(host);
-    if (!addr) {
-        perror("bad hostname");
-        exit(1);
-    }
-    peer.sin_family = addr->h_addrtype;
-    peer.sin_port = htons(BLOCKSTORED_PORT);
-    peer.sin_addr.s_addr =  ((struct in_addr *)(addr->h_addr))->s_addr;
-    fprintf(stderr, "Sending to: %u.%u.%u.%u\n",
-            (unsigned int)(unsigned char)addr->h_addr[0],
-            (unsigned int)(unsigned char)addr->h_addr[1],
-            (unsigned int)(unsigned char)addr->h_addr[2],
-            (unsigned int)(unsigned char)addr->h_addr[3]);
-
-    sock = socket(AF_INET, SOCK_DGRAM, 0);
-    if (sock < 0) {
-        perror("Bad socket");
-        exit(1);
-    }
-    memset(&sn, 0, sizeof(sn));
-    sn.sin_family = AF_INET;
-    sn.sin_port = htons(BLOCKSTORED_PORT);
-    sn.sin_addr.s_addr = htonl(INADDR_ANY);
-    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
-        perror("bind");
-        close(sock);
-        exit(1);
-    }
-
-    memset((void *)&msgbuf, 0, sizeof(msgbuf));
-    msgbuf.operation = op;
-    msgbuf.id = id;
-
-    rc = sendto(sock, (void *)&msgbuf, len, 0,
-                (struct sockaddr *)&peer, sizeof(peer));
-    if (rc < 0) {
-        perror("sendto");
-        exit(1);
-    }
-
-    slen = sizeof(peer);
-    len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0,
-                   (struct sockaddr *)&peer, &slen);
-    if (len < 0) {
-        perror("recvfrom");
-        exit(1);
-    }
-
-    printf("Reply %u bytes:\n", len);
-    if (len >= MSGBUFSIZE_OP)
-        printf("  operation: %u\n", msgbuf.operation);
-    if (len >= MSGBUFSIZE_FLAGS)
-        printf("  flags: 0x%x\n", msgbuf.flags);
-    if (len >= MSGBUFSIZE_ID)
-        printf("  id: %llu\n", msgbuf.id);
-    if (len >= (MSGBUFSIZE_ID + 4))
-        printf("  data: %02x %02x %02x %02x...\n",
-               (unsigned int)msgbuf.block[0],
-               (unsigned int)msgbuf.block[1],
-               (unsigned int)msgbuf.block[2],
-               (unsigned int)msgbuf.block[3]);
-    
-    if (sock > 0)
-        close(sock);
-   
-    return 0;
-}
-
-int main (int argc, char **argv) {
-
-    u32 op = 0;
-    u64 id = 0;
-    int len = 0, rc;
-    void *block;
-
-    if (argc < 3) {
-        fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n");
-        return 1;
-    }
-
-    switch (argv[2][0]) {
-    case 'r':
-    case 'R':
-        op = BSOP_READBLOCK;
-        len = MSGBUFSIZE_ID;
-        break;
-    case 'w':
-    case 'W':
-        op = BSOP_WRITEBLOCK;
-        len = MSGBUFSIZE_BLOCK;
-        break;
-    case 'a':
-    case 'A':
-        op = BSOP_ALLOCBLOCK;
-        len = MSGBUFSIZE_BLOCK;
-        break;
-    default:
-        fprintf(stderr, "Unknown action '%s'.\n", argv[2]);
-        return 1;
-    }
-
-    if (argc >= 4)
-        id = atoll(argv[3]);
-
-    if (strcmp(argv[1], "X") == 0) {
-        rc = __init_blockstore();
-        if (rc < 0) {
-            fprintf(stderr, "blockstore init failed.\n");
-            return 1;
-        }
-        switch(op) {
-        case BSOP_READBLOCK:
-            block = readblock(id);
-            if (block) {
-                printf("data: %02x %02x %02x %02x...\n",
-                       (unsigned int)((unsigned char*)block)[0],
-                       (unsigned int)((unsigned char*)block)[1],
-                       (unsigned int)((unsigned char*)block)[2],
-                       (unsigned int)((unsigned char*)block)[3]);
-            }
-            break;
-        case BSOP_WRITEBLOCK:
-            block = malloc(BLOCK_SIZE);
-            if (!block) {
-                perror("bstest malloc");
-                return 1;
-            }
-            memset(block, 0, BLOCK_SIZE);
-            rc = writeblock(id, block);
-            if (rc != 0) {
-                printf("error\n");
-            }
-            else {
-                printf("OK\n");
-            }
-            break;
-        case BSOP_ALLOCBLOCK:
-            block = malloc(BLOCK_SIZE);
-            if (!block) {
-                perror("bstest malloc");
-                return 1;
-            }
-            memset(block, 0, BLOCK_SIZE);
-            id = allocblock_hint(block, id);
-            if (id == 0) {
-                printf("error\n");
-            }
-            else {
-                printf("ID: %llu\n", id);
-            }
-            break;
-        }
-    }
-    else {
-        direct(argv[1], op, id, len);
-    }
-
-
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax.c
--- a/tools/blktap/parallax.c   Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,611 +0,0 @@
-/**************************************************************************
- * 
- * parallax.c
- *
- * The Parallax Storage Server
- *
- */
- 
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <pthread.h>
-#include "blktaplib.h"
-#include "blockstore.h"
-#include "vdi.h"
-#include "block-async.h"
-#include "requests-async.h"
-
-#define PARALLAX_DEV     61440
-#define SECTS_PER_NODE   8
-
-
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-/* ------[ session records ]----------------------------------------------- */
-
-#define BLKIF_HASHSZ 1024
-#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
-
-#define VDI_HASHSZ 16
-#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1))
-
-typedef struct blkif {
-    domid_t       domid;
-    unsigned int  handle;
-    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
-    vdi_t        *vdi_hash[VDI_HASHSZ];
-    struct blkif *hash_next;
-} blkif_t;
-
-static blkif_t      *blkif_hash[BLKIF_HASHSZ];
-
-blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
-{
-    if ( handle != 0 )
-        printf("blktap/parallax don't currently support non-0 dev handles!\n");
-    
-    blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( (blkif != NULL) && 
-            ((blkif->domid != domid) || (blkif->handle != handle)) )
-        blkif = blkif->hash_next;
-    return blkif;
-}
-
-vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device)
-{
-    vdi_t *vdi = blkif->vdi_hash[VDI_HASH(device)];
-    
-    while ((vdi != NULL) && (vdi->vdevice != device))
-        vdi = vdi->next;
-    
-    return vdi;
-}
-
-/* ------[ control message handling ]-------------------------------------- */
-
-void blkif_create(blkif_be_create_t *create)
-{
-    domid_t       domid  = create->domid;
-    unsigned int  handle = create->blkif_handle;
-    blkif_t     **pblkif, *blkif;
-
-    DPRINTF("parallax (blkif_create): create is %p\n", create); 
-    
-    if ( (blkif = (blkif_t *)malloc(sizeof(blkif_t))) == NULL )
-    {
-        DPRINTF("Could not create blkif: out of memory\n");
-        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
-        return;
-    }
-
-    memset(blkif, 0, sizeof(*blkif));
-    blkif->domid  = domid;
-    blkif->handle = handle;
-    blkif->status = DISCONNECTED;
-
-    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( *pblkif != NULL )
-    {
-        if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
-        {
-            DPRINTF("Could not create blkif: already exists (%d,%d)\n",
-                domid, handle);
-            create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
-            free(blkif);
-            return;
-        }
-        pblkif = &(*pblkif)->hash_next;
-    }
-
-    blkif->hash_next = *pblkif;
-    *pblkif = blkif;
-
-    DPRINTF("Successfully created blkif\n");
-    create->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void blkif_destroy(blkif_be_destroy_t *destroy)
-{
-    domid_t       domid  = destroy->domid;
-    unsigned int  handle = destroy->blkif_handle;
-    blkif_t     **pblkif, *blkif;
-
-    DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy); 
-    
-    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( (blkif = *pblkif) != NULL )
-    {
-        if ( (blkif->domid == domid) && (blkif->handle == handle) )
-        {
-            if ( blkif->status != DISCONNECTED )
-                goto still_connected;
-            goto destroy;
-        }
-        pblkif = &blkif->hash_next;
-    }
-
-    destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
-    return;
-
- still_connected:
-    destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
-    return;
-
- destroy:
-    *pblkif = blkif->hash_next;
-    free(blkif);
-    destroy->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void vbd_create(blkif_be_vbd_create_t *create)
-{
-    blkif_t            *blkif;
-    vdi_t              *vdi, **vdip;
-    blkif_vdev_t        vdevice = create->vdevice;
-
-    DPRINTF("parallax (vbd_create): create=%p\n", create); 
-    
-    blkif = blkif_find_by_handle(create->domid, create->blkif_handle);
-    if ( blkif == NULL )
-    {
-        DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n", 
-                create->domid, create->blkif_handle); 
-        create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
-        return;
-    }
-
-    /* VDI identifier is in grow->extent.sector_start */
-    DPRINTF("vbd_create: create->dev_handle (id) is %lx\n", 
-            (unsigned long)create->dev_handle);
-
-    vdi = vdi_get(create->dev_handle);
-    if (vdi == NULL)
-    {
-        printf("parallax (vbd_create): VDI %lx not found.\n",
-               (unsigned long)create->dev_handle);
-        create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
-        return;
-    }
-    
-    vdi->next = NULL;
-    vdi->vdevice = vdevice;
-    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
-    while (*vdip != NULL)
-        vdip = &(*vdip)->next;
-    *vdip = vdi;
-    
-    DPRINTF("blkif_create succeeded\n"); 
-    create->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
-{
-    blkif_t            *blkif;
-    vdi_t              *vdi, **vdip;
-    blkif_vdev_t        vdevice = destroy->vdevice;
-    
-    blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
-    if ( blkif == NULL )
-    {
-        DPRINTF("vbd_destroy attempted for non-existent blkif (%u,%u)\n", 
-                destroy->domid, destroy->blkif_handle); 
-        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
-        return;
-    }
-
-    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
-    while ((*vdip != NULL) && ((*vdip)->vdevice != vdevice))
-        vdip = &(*vdip)->next;
-
-    if (*vdip != NULL) 
-    {
-        vdi = *vdip;
-        *vdip = vdi->next;
-        vdi_put(vdi);
-    }
-        
-}
-
-int parallax_control(control_msg_t *msg)
-{
-    domid_t  domid;
-    int      ret;
-
-    DPRINTF("parallax_control: msg is %p\n", msg); 
-    
-    if (msg->type != CMSG_BLKIF_BE) 
-    {
-        printf("Unexpected control message (%d)\n", msg->type);
-        return 0;
-    }
-    
-    switch(msg->subtype)
-    {
-    case CMSG_BLKIF_BE_CREATE:
-        if ( msg->length != sizeof(blkif_be_create_t) )
-            goto parse_error;
-        blkif_create((blkif_be_create_t *)msg->msg);
-        break;   
-        
-    case CMSG_BLKIF_BE_DESTROY:
-        if ( msg->length != sizeof(blkif_be_destroy_t) )
-            goto parse_error;
-        blkif_destroy((blkif_be_destroy_t *)msg->msg);
-        break;  
-        
-    case CMSG_BLKIF_BE_VBD_CREATE:
-        if ( msg->length != sizeof(blkif_be_vbd_create_t) )
-            goto parse_error;
-        vbd_create((blkif_be_vbd_create_t *)msg->msg);
-        break;
-        
-    case CMSG_BLKIF_BE_VBD_DESTROY:
-        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
-            goto parse_error;
-        vbd_destroy((blkif_be_vbd_destroy_t *)msg->msg);
-        break;
-
-    case CMSG_BLKIF_BE_CONNECT:
-    case CMSG_BLKIF_BE_DISCONNECT:
-        /* we don't manage the device channel, the tap does. */
-        break;
-
-    default:
-        goto parse_error;
-    }
-    return 0;
-parse_error:
-    printf("Bad control message!\n");
-    return 0;
-    
-}    
-
-int parallax_probe(blkif_request_t *req, blkif_t *blkif)
-{
-    blkif_response_t *rsp;
-    vdisk_t *img_info;
-    vdi_t *vdi;
-    int i, nr_vdis = 0; 
-
-    DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif); 
-
-    /* We expect one buffer only. */
-    if ( req->nr_segments != 1 )
-      goto err;
-
-    /* Make sure the buffer is page-sized. */
-    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
-       (blkif_last_sect (req->frame_and_sects[0]) != 7) )
-      goto err;
-
-    /* fill the list of devices */
-    for (i=0; i<VDI_HASHSZ; i++) {
-        vdi = blkif->vdi_hash[i];
-        while (vdi) {
-            img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0);
-            img_info[nr_vdis].device   = vdi->vdevice;
-            img_info[nr_vdis].info     = 0;
-            /* The -1 here accounts for the LSB in the radix tree */
-            img_info[nr_vdis].capacity = 
-                    ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE);
-            nr_vdis++;
-            vdi = vdi->next;
-        }
-    }
-
-    
-    rsp = (blkif_response_t *)req;
-    rsp->id = req->id;
-    rsp->operation = BLKIF_OP_PROBE;
-    rsp->status = nr_vdis; /* number of disks */
-
-    DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis);
-    return  BLKTAP_RESPOND;
-err:
-    rsp = (blkif_response_t *)req;
-    rsp->id = req->id;
-    rsp->operation = BLKIF_OP_PROBE;
-    rsp->status = BLKIF_RSP_ERROR;
-    
-    DPRINTF("parallax_probe: send error response\n"); 
-    return BLKTAP_RESPOND;  
-}
-
-typedef struct {
-    blkif_request_t *req;
-    int              count;
-    int              error;
-    pthread_mutex_t  mutex;
-} pending_t;
-
-#define MAX_REQUESTS 64
-pending_t pending_list[MAX_REQUESTS];
-
-struct cb_param {
-    pending_t *pent;
-    int       segment;
-    u64       sector; 
-    u64       vblock; /* for debug printing -- can be removed. */
-};
-
-static void read_cb(struct io_ret r, void *in_param)
-{
-    struct cb_param *param = (struct cb_param *)in_param;
-    pending_t *p = param->pent;
-    int segment = param->segment;
-    blkif_request_t *req = p->req;
-    unsigned long size, offset, start;
-    char *dpage, *spage;
-       
-    spage  = IO_BLOCK(r);
-    if (spage == NULL) { p->error++; goto finish; }
-    dpage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment);
-    
-    /* Calculate read size and offset within the read block. */
-
-    offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE;
-    size = ( blkif_last_sect (req->frame_and_sects[segment]) -
-             blkif_first_sect(req->frame_and_sects[segment]) + 1
-        ) << SECTOR_SHIFT;
-    start = blkif_first_sect(req->frame_and_sects[segment]) 
-        << SECTOR_SHIFT;
-
-    DPRINTF("ParallaxRead: sect: %lld (%ld,%ld),  "
-            "vblock %llx, "
-            "size %lx\n", 
-            param->sector, blkif_first_sect(p->req->frame_and_sects[segment]),
-            blkif_last_sect (p->req->frame_and_sects[segment]),
-            param->vblock, size); 
-
-    memcpy(dpage + start, spage + offset, size);
-    freeblock(spage);
-    
-    /* Done the read.  Now update the pending record. */
- finish:
-    pthread_mutex_lock(&p->mutex);
-    p->count--;
-    
-    if (p->count == 0) {
-       blkif_response_t *rsp;
-       
-        rsp = (blkif_response_t *)req;
-        rsp->id = req->id;
-        rsp->operation = BLKIF_OP_READ;
-       if (p->error == 0) {
-            rsp->status = BLKIF_RSP_OKAY;
-       } else {
-            rsp->status = BLKIF_RSP_ERROR;
-       }
-        blktap_inject_response(rsp);       
-    }
-    
-    pthread_mutex_unlock(&p->mutex);
-       
-    free(param); /* TODO: replace with cached alloc/dealloc */
-}      
-
-int parallax_read(blkif_request_t *req, blkif_t *blkif)
-{
-    blkif_response_t *rsp;
-    u64 vblock, gblock;
-    vdi_t *vdi;
-    u64 sector;
-    int i;
-    char *dpage, *spage;
-    pending_t *pent;
-
-    vdi = blkif_get_vdi(blkif, req->device);
-    
-    if ( vdi == NULL )
-        goto err;
-        
-    pent = &pending_list[ID_TO_IDX(req->id)];
-    pent->count = req->nr_segments;
-    pent->req = req;
-    pthread_mutex_init(&pent->mutex, NULL);
-    
-    for (i = 0; i < req->nr_segments; i++) {
-        pthread_t tid;
-        int ret;
-        struct cb_param *p;
-        
-        /* Round the requested segment to a block address. */
-        sector  = req->sector_number + (8*i);
-        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
-        
-        /* TODO: Replace this call to malloc with a cached allocation */
-        p = (struct cb_param *)malloc(sizeof(struct cb_param));
-        p->pent = pent;
-        p->sector = sector; 
-        p->segment = i;     
-        p->vblock = vblock; /* dbg */
-        
-        /* Get that block from the store. */
-        vdi_read(vdi, vblock, read_cb, (void *)p);    
-    }
-    
-    return BLKTAP_STOLEN;
-
-err:
-    rsp = (blkif_response_t *)req;
-    rsp->id = req->id;
-    rsp->operation = BLKIF_OP_READ;
-    rsp->status = BLKIF_RSP_ERROR;
-    
-    return BLKTAP_RESPOND;  
-}
-
-static void write_cb(struct io_ret r, void *in_param)
-{
-    struct cb_param *param = (struct cb_param *)in_param;
-    pending_t *p = param->pent;
-    blkif_request_t *req = p->req;
-    
-    /* catch errors from the block code. */
-    if (IO_INT(r) < 0) p->error++;
-    
-    pthread_mutex_lock(&p->mutex);
-    p->count--;
-    
-    if (p->count == 0) {
-       blkif_response_t *rsp;
-       
-        rsp = (blkif_response_t *)req;
-        rsp->id = req->id;
-        rsp->operation = BLKIF_OP_WRITE;
-       if (p->error == 0) {
-            rsp->status = BLKIF_RSP_OKAY;
-       } else {
-            rsp->status = BLKIF_RSP_ERROR;
-       }
-        blktap_inject_response(rsp);       
-    }
-    
-    pthread_mutex_unlock(&p->mutex);
-       
-    free(param); /* TODO: replace with cached alloc/dealloc */
-}
-
-int parallax_write(blkif_request_t *req, blkif_t *blkif)
-{
-    blkif_response_t *rsp;
-    u64 sector;
-    int i, writable = 0;
-    u64 vblock, gblock;
-    char *spage;
-    unsigned long size, offset, start;
-    vdi_t *vdi;
-    pending_t *pent;
-
-    vdi = blkif_get_vdi(blkif, req->device);
-    
-    if ( vdi == NULL )
-        goto err;
-        
-    pent = &pending_list[ID_TO_IDX(req->id)];
-    pent->count = req->nr_segments;
-    pent->req = req;
-    pthread_mutex_init(&pent->mutex, NULL);
-    
-    for (i = 0; i < req->nr_segments; i++) {
-        struct cb_param *p;
-        
-        spage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
-        
-        /* Round the requested segment to a block address. */
-        
-        sector  = req->sector_number + (8*i);
-        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
-        
-        /* Calculate read size and offset within the read block. */
-        
-        offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE;
-        size = ( blkif_last_sect (req->frame_and_sects[i]) -
-                 blkif_first_sect(req->frame_and_sects[i]) + 1
-            ) << SECTOR_SHIFT;
-        start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
-
-        DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld),  "
-                "vblock %llx, gblock %llx, "
-                "size %lx\n", 
-                sector, blkif_first_sect(req->frame_and_sects[i]),
-                blkif_last_sect (req->frame_and_sects[i]),
-                vblock, gblock, size); 
-      
-        /* XXX: For now we just freak out if they try to write a   */
-        /* non block-sized, block-aligned page.                    */
-        
-        if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) {
-            printf("]\n] STRANGE WRITE!\n]\n");
-            goto err;
-        }
-        
-        /* TODO: Replace this call to malloc with a cached allocation */
-        p = (struct cb_param *)malloc(sizeof(struct cb_param));
-        p->pent = pent;
-        p->sector = sector; 
-        p->segment = i;     
-        p->vblock = vblock; /* dbg */
-        
-        /* Issue the write to the store. */
-        vdi_write(vdi, vblock, spage, write_cb, (void *)p);
-    }
-
-    return BLKTAP_STOLEN;
-
-err:
-    rsp = (blkif_response_t *)req;
-    rsp->id = req->id;
-    rsp->operation = BLKIF_OP_WRITE;
-    rsp->status = BLKIF_RSP_ERROR;
-    
-    return BLKTAP_RESPOND;  
-}
-
-int parallax_request(blkif_request_t *req)
-{
-    blkif_response_t *rsp;
-    domid_t  dom   = ID_TO_DOM(req->id);
-    blkif_t *blkif = blkif_find_by_handle(dom, 0);
-    
-    if (blkif == NULL)
-        goto err;
-    
-    if ( req->operation == BLKIF_OP_PROBE ) {
-        
-        return parallax_probe(req, blkif);
-        
-    } else if ( req->operation == BLKIF_OP_READ ) {
-        
-        return parallax_read(req, blkif);
-        
-    } else if ( req->operation == BLKIF_OP_WRITE ) {
-        
-        return parallax_write(req, blkif);
-        
-    } else {
-        printf("Unknown request message type!\n");
-        /* Unknown operation */
-        goto err;
-    }
-    
-err:
-    rsp = (blkif_response_t *)req;
-    rsp->operation = req->operation;
-    rsp->id = req->id;
-    rsp->status = BLKIF_RSP_ERROR;
-    return BLKTAP_RESPOND;  
-}
-
-void __init_parallax(void) 
-{
-    memset(blkif_hash, 0, sizeof(blkif_hash));
-}
-
-
-
-int main(int argc, char *argv[])
-{
-    DPRINTF("parallax: starting.\n"); 
-    __init_blockstore();
-    DPRINTF("parallax: initialized blockstore...\n"); 
-    init_block_async();
-    DPRINTF("parallax: initialized async blocks...\n"); 
-    __init_vdi();
-    DPRINTF("parallax: initialized vdi registry etc...\n"); 
-    __init_parallax();
-    DPRINTF("parallax: initialized local stuff..\n"); 
-
-    blktap_register_ctrl_hook("parallax_control", parallax_control);
-    blktap_register_request_hook("parallax_request", parallax_request);
-    DPRINTF("parallax: added ctrl + request hooks, starting listen...\n"); 
-    blktap_listen();
-    
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/radix.c
--- a/tools/blktap/radix.c      Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,631 +0,0 @@
-/*
- * Radix tree for mapping (up to) 63-bit virtual block IDs to
- * 63-bit global block IDs
- *
- * Pointers within the tree set aside the least significant bit to indicate
- * whther or not the target block is writable from this node.
- *
- * The block with ID 0 is assumed to be an empty block of all zeros
- */
-
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <string.h>
-#include <pthread.h>
-#include "blockstore.h"
-#include "radix.h"
-
-#define RADIX_TREE_MAP_SHIFT 9
-#define RADIX_TREE_MAP_MASK 0x1ff
-#define RADIX_TREE_MAP_ENTRIES 512
-
-/*
-#define DEBUG
-*/
-
-/* Experimental radix cache. */
-
-static  pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER;
-static  int rcache_count = 0;
-#define RCACHE_MAX 1024
-
-typedef struct rcache_st {
-    radix_tree_node  *node;
-    u64               id;
-    struct rcache_st *hash_next;
-    struct rcache_st *cache_next;
-    struct rcache_st *cache_prev;
-} rcache_t;
-
-static rcache_t *rcache_head = NULL;
-static rcache_t *rcache_tail = NULL;
-
-#define RCHASH_SIZE 512ULL
-rcache_t *rcache[RCHASH_SIZE];
-#define RCACHE_HASH(_id) ((_id) & (RCHASH_SIZE - 1))
-
-void __rcache_init(void)
-{
-    int i;
-
-    for (i=0; i<RCHASH_SIZE; i++)
-        rcache[i] = NULL;
-}
-    
-
-void rcache_write(u64 id, radix_tree_node *node)
-{
-    rcache_t *r, *tmp, **curs;
-    
-    pthread_mutex_lock(&rcache_mutex);
-    
-    /* Is it already in the cache? */
-    r = rcache[RCACHE_HASH(id)];
-    
-    for (;;) {
-        if (r == NULL) 
-            break;
-        if (r->id == id) 
-        {
-            memcpy(r->node, node, BLOCK_SIZE);
-            
-            /* bring to front. */
-            if (r != rcache_head) {
-                
-                if (r == rcache_tail) {
-                    if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
-                    rcache_tail->cache_next = NULL;
-                }
-
-                tmp = r->cache_next;
-                if (r->cache_next != NULL) r->cache_next->cache_prev 
-                                                     = r->cache_prev;
-                if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
-
-                r->cache_prev = NULL;
-                r->cache_next = rcache_head;
-                if (rcache_head != NULL) rcache_head->cache_prev = r;
-                rcache_head = r;
-            }
-
-//printf("Update (%Ld)\n", r->id);
-            goto done;
-        }
-        r = r->hash_next;
-    }
-    
-    if ( rcache_count == RCACHE_MAX ) 
-    {
-        /* Remove an entry */
-        
-        r = rcache_tail;
-        if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
-        rcache_tail->cache_next = NULL;
-        freeblock(r->node);
-        
-        curs = &rcache[RCACHE_HASH(r->id)];
-        while ((*curs) != r)
-            curs = &(*curs)->hash_next;
-        *curs = r->hash_next;
-//printf("Evict (%Ld)\n", r->id);
-        
-    } else {
-        
-        r = (rcache_t *)malloc(sizeof(rcache_t));
-        rcache_count++;
-    }
-    
-    r->node = newblock();
-    memcpy(r->node, node, BLOCK_SIZE);
-    r->id = id;
-    
-    r->hash_next = rcache[RCACHE_HASH(id)];
-    rcache[RCACHE_HASH(id)] = r;
-    
-    r->cache_prev = NULL;
-    r->cache_next = rcache_head;
-    if (rcache_head != NULL) rcache_head->cache_prev = r;
-    rcache_head = r;
-    if (rcache_tail == NULL) rcache_tail = r;
-    
-//printf("Added (%Ld, %p)\n", id, r->node);
-done:
-    pthread_mutex_unlock(&rcache_mutex);
-}
-
-radix_tree_node *rcache_read(u64 id)
-{
-    rcache_t *r, *tmp;
-    radix_tree_node *node = NULL;
-    
-    pthread_mutex_lock(&rcache_mutex);
-
-    r = rcache[RCACHE_HASH(id)];
-    
-    for (;;) {
-        if (r == NULL) {
-//printf("Miss (%Ld)\n", id);
-            goto done;
-        }
-        if (r->id == id) break;
-        r = r->hash_next;
-    }
-   
-    /* bring to front. */
-    if (r != rcache_head) 
-    {
-        if (r == rcache_tail) {
-            if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
-            rcache_tail->cache_next = NULL;
-        }
-        tmp = r->cache_next;
-        if (r->cache_next != NULL) r->cache_next->cache_prev = r->cache_prev;
-        if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
-
-        r->cache_prev = NULL;
-        r->cache_next = rcache_head;
-        if (rcache_head != NULL) rcache_head->cache_prev = r;
-        rcache_head = r;
-    }
-    
-    node = newblock();
-    memcpy(node, r->node, BLOCK_SIZE);
-    
-//printf("Hit (%Ld, %p)\n", id, r->node);
-done:
-    pthread_mutex_unlock(&rcache_mutex);
-    
-    return(node);
-}
-
-
-void *rc_readblock(u64 id)
-{
-    void *ret;
-    
-    ret = (void *)rcache_read(id);
-    
-    if (ret != NULL) return ret;
-    
-    ret = readblock(id);
-    
-    if (ret != NULL)
-        rcache_write(id, ret);
-    
-    return(ret);
-}
-
-u64 rc_allocblock(void *block)
-{
-    u64 ret;
-    
-    ret = allocblock(block);
-    
-    if (ret != ZERO)
-        rcache_write(ret, block);
-    
-    return(ret);
-}
-
-int rc_writeblock(u64 id, void *block)
-{
-    int ret;
-    
-    ret = writeblock(id, block);
-    rcache_write(id, block);
-    
-    return(ret);
-}
-
-
-/*
- * block device interface and other helper functions
- * with these functions, block id is just a 63-bit number, with
- * no special consideration for the LSB
- */
-radix_tree_node cloneblock(radix_tree_node block);
-
-/*
- * main api
- * with these functions, the LSB of root always indicates
- * whether or not the block is writable, including the return
- * values of update and snapshot
- */
-u64 lookup(int height, u64 root, u64 key);
-u64 update(int height, u64 root, u64 key, u64 val);
-u64 snapshot(u64 root);
-
-/**
- * cloneblock: clone an existing block in memory
- *   @block: the old block
- *
- *   @return: new block, with LSB cleared for every entry
- */
-radix_tree_node cloneblock(radix_tree_node block) {
-    radix_tree_node node = (radix_tree_node) malloc(BLOCK_SIZE);
-    int i;
-    if (node == NULL) {
-        perror("cloneblock malloc");
-        return NULL;
-    }
-    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
-        node[i] = block[i] & ONEMASK;
-    return node;
-}
-
-/**
- * lookup: find a value given a key
- *   @height: height in bits of the radix tree
- *   @root: root node id, with set LSB indicating writable node
- *   @key: key to lookup
- *
- *   @return: value on success, zero on error
- */
-
-u64 lookup(int height, u64 root, u64 key) {
-    radix_tree_node node;
-    u64 mask = ONE;
-    
-    assert(key >> height == 0);
-
-    /* the root block may be smaller to ensure all leaves are full */
-    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
-
-    /* now carve off equal sized chunks at each step */
-    for (;;) {
-        u64 oldroot;
-
-#ifdef DEBUG
-        printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", height, root,
-                (int) ((key >> height) & RADIX_TREE_MAP_MASK),
-                (iswritable(root) ? "" : " (readonly)"));
-#endif
-        
-        if (getid(root) == ZERO)
-            return ZERO;
-
-        oldroot = root;
-        node = (radix_tree_node) rc_readblock(getid(root));
-        if (node == NULL)
-            return ZERO;
-
-        root = node[(key >> height) & RADIX_TREE_MAP_MASK];
-        mask &= root;
-        freeblock(node);
-
-        if (height == 0)
-            return ( root & ONEMASK ) | mask;
-
-        height -= RADIX_TREE_MAP_SHIFT;
-    }
-
-    return ZERO;
-}
-
-/*
- * update: set a radix tree entry, doing copy-on-write as necessary
- *   @height: height in bits of the radix tree
- *   @root: root node id, with set LSB indicating writable node
- *   @key: key to set
- *   @val: value to set, s.t. radix(key)=val
- *
- *   @returns: (possibly new) root id on success (with LSB=1), 0 on failure
- */
-
-u64 update(int height, u64 root, u64 key, u64 val) {
-    int offset;
-    u64 child;
-    radix_tree_node node;
-    
-    /* base case--return val */
-    if (height == 0)
-        return val;
-
-    /* the root block may be smaller to ensure all leaves are full */
-    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
-    offset = (key >> height) & RADIX_TREE_MAP_MASK;
-
-#ifdef DEBUG
-    printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root,
-            offset, (iswritable(root)?"":" (clone)"));
-#endif
-
-    /* load a block, or create a new one */
-    if (root == ZERO) {
-        node = (radix_tree_node) newblock();
-    } else {
-        node = (radix_tree_node) rc_readblock(getid(root));
-
-        if (!iswritable(root)) {
-            /* need to clone this node */
-            radix_tree_node oldnode = node;
-            node = cloneblock(node);
-            freeblock(oldnode);
-            root = ZERO;
-        }
-    }
-
-    if (node == NULL) {
-#ifdef DEBUG
-        printf("update: node is null!\n");
-#endif
-        return ZERO;
-    }
-
-    child = update(height, node[offset], key, val);
-
-    if (child == ZERO) {
-        freeblock(node);
-        return ZERO;
-    } else if (child == node[offset]) {
-        /* no change, so we already owned the child */
-        assert(iswritable(root));
-
-        freeblock(node);
-        return root;
-    }
-
-    node[offset] = child;
-
-    /* new/cloned blocks need to be saved */
-    if (root == ZERO) {
-        /* mark this as an owned block */
-        root = rc_allocblock(node);
-        if (root)
-            root = writable(root);
-    } else if (rc_writeblock(getid(root), node) < 0) {
-        freeblock(node);
-        return ZERO;
-    }
-
-    freeblock(node);
-    return root;
-}
-
-/**
- * snapshot: create a snapshot
- *   @root: old root node
- *
- *   @return: new root node, 0 on error
- */
-u64 snapshot(u64 root) {
-    radix_tree_node node, newnode;
-
-    if ((node = rc_readblock(getid(root))) == NULL)
-        return ZERO;
-
-    newnode = cloneblock(node);
-    freeblock(node);
-    if (newnode == NULL)
-        return ZERO;
-    
-    root = rc_allocblock(newnode);
-    freeblock(newnode);
-
-    if (root == ZERO)
-        return ZERO;
-    else
-        return writable(root);
-}
-
-/**
- * collapse: collapse a parent onto a child.
- * 
- * NOTE: This assumes that parent and child really are, and further that
- * there are no other children forked from this parent. (children of the
- * child are okay...)
- */
-
-int collapse(int height, u64 proot, u64 croot)
-{
-    int i, numlinks, ret, total = 0;
-    radix_tree_node pnode, cnode;
-    
-    if (height == 0) {
-        height = -1; /* terminate recursion */
-    } else {        
-        height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
-    }
-    numlinks = (1UL << RADIX_TREE_MAP_SHIFT);
-
-    /* Terminal cases: */
-
-    if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) )
-        return -1;
-    
-    /* get roots */
-    if ((pnode = readblock(getid(proot))) == NULL)
-        return -1;
-    
-    if ((cnode = readblock(getid(croot))) == NULL)
-    {
-        freeblock(pnode);
-        return -1;
-    }
-    
-    /* For each writable link in proot */
-    for (i=0; i<numlinks; i++)
-    {
-        if ( pnode[i] == cnode[i] ) continue;
-        
-        /* collapse (next level) */
-        /* if height != 0 and writable... */
-        if (( height >= 0 ) && ( iswritable(pnode[i]) ) )
-        {
-            //printf("   %Ld is writable (i=%d).\n", getid(pnode[i]), i);
-            ret = collapse(height, pnode[i], cnode[i]);
-            if (ret == -1) 
-            {
-                total = -1;
-            } else {
-                total += ret;
-            }
-        }
-    
-        
-    }
-    
-    /* if plink is writable, AND clink is writable -> free plink block */
-    if ( ( iswritable(proot) ) && ( iswritable(croot) ) ) 
-    {
-        releaseblock(getid(proot));
-        if (ret >=0) total++;
-        //printf("   Delete %Ld\n", getid(proot));
-    }
-//printf("done : %Ld\n", getid(proot));
-    return total;
-
-}
-
-
-void print_root(u64 root, int height, FILE *dot_f)
-{
-    FILE *f;
-    int i;
-    radix_tree_node node;
-    char *style[2] = { "", "style=bold,color=blue," };
-    
-    if (dot_f == NULL) {
-        f = fopen("radix.dot", "w");
-        if (f == NULL) {
-            perror("print_root: open");
-            return;
-        }
-
-        /* write graph preamble */
-        fprintf(f, "digraph G {\n");
-
-        /* add a node for this root. */
-        fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n", 
-                getid(root), style[iswritable(root)], getid(root));
-    }
-    
-    printf("print_root(%Ld)\n", getid(root));
-    
-    /* base case */
-    if (height == 0) {
-        /* add a node and edge for each child root */
-        node = (radix_tree_node) readblock(getid(root));
-        if (node == NULL)
-            return;
-        
-        for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) {
-            if (node[i] != ZERO) {
-                fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n", 
-                        getid(node[i]), style[iswritable(node[i])], 
-                        getid(node[i]));
-                fprintf(f, "   n%Ld -> n%Ld [label=\"%d\"]\n", getid(root), 
-                        getid(node[i]), i);
-            }
-        }
-        freeblock(node);
-        return;
-    }
-
-    /* the root block may be smaller to ensure all leaves are full */
-    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
-
-    if (getid(root) == ZERO)
-        return;
-
-    node = (radix_tree_node) readblock(getid(root));
-    if (node == NULL)
-        return;
-
-    /* add a node and edge for each child root */
-    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
-        if (node[i] != ZERO) {
-            fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n", 
-                    getid(node[i]), style[iswritable(node[i])], 
-                    getid(node[i]));
-
-            print_root(node[i], height-RADIX_TREE_MAP_SHIFT, f);
-            fprintf(f, "   n%Ld -> n%Ld [label=\"%d\"]\n", getid(root), 
-                    getid(node[i]), i);
-        }
-
-    freeblock(node);
-    
-    /* write graph postamble */
-    if (dot_f == NULL) {
-        fprintf(f, "}\n");
-        fclose(f);
-    }
-}
-
-#ifdef RADIX_STANDALONE
-
-int main(int argc, char **argv) {
-    u64 key = ZERO, val = ZERO;
-    u64 root = writable(2ULL);
-    u64 p = ZERO, c = ZERO;
-    int v;
-    char buff[4096];
-
-    __init_blockstore();
-    
-    memset(buff, 0, 4096);
-    /*fp = open("radix.dat", O_RDWR | O_CREAT, 0644);
-
-    if (fp < 3) {
-        perror("open");
-        return -1;
-    }
-    if (lseek(fp, 0, SEEK_END) == 0) {
-        write(fp, buff, 4096);
-    }*/
-        
-    allocblock(buff);
-            
-    printf("Recognized commands:\n"
-           "Note: the LSB of a node number indicates if it is writable\n"
-           "  root <node>               set root to <node>\n"
-           "  snapshot                  take a snapshot of the root\n"
-           "  set <key> <val>           set key=val\n"
-           "  get <key>                 query key\n"
-           "  c <proot> <croot>         collapse\n"
-           "  pr                        print tree to dot\n"
-           "  pf <1=verbose>            print freelist\n"
-           "  quit\n"
-           "\nroot = %Ld\n", root);
-    for (;;) {
-        //print_root(root, 34, NULL);
-        //system("dot radix.dot -Tps -o radix.ps");
-
-        printf("> ");
-        fflush(stdout);
-        fgets(buff, 1024, stdin);
-        if (feof(stdin))
-            break;
-        if (sscanf(buff, " root %Ld", &root) == 1) {
-            printf("root set to %Ld\n", root);
-        } else if (sscanf(buff, " set %Ld %Ld", &key, &val) == 2) {
-            root = update(34, root, key, val);
-            printf("root = %Ld\n", root);
-        } else if (sscanf(buff, " c %Ld %Ld", &p, &c) == 2) {
-            v = collapse(34, p, c);
-            printf("reclaimed %d blocks.\n", v);
-        } else if (sscanf(buff, " get %Ld", &key) == 1) {
-            val = lookup(34, root, key);
-            printf("value = %Ld\n", val);
-        } else if (!strcmp(buff, "quit\n")) {
-            break;
-        } else if (!strcmp(buff, "snapshot\n")) {
-            root = snapshot(root);
-            printf("new root = %Ld\n", root);
-        } else if (sscanf(buff, " pr %Ld", &root) == 1) {
-            print_root(root, 34, NULL);
-        } else if (sscanf(buff, " pf %d", &v) == 1) {
-            freelist_count(v);
-        } else if (!strcmp(buff, "pf\n")) {
-            freelist_count(0);
-        } else {
-            printf("command not recognized\n");
-        }
-    }
-    return 0;
-}
-
-#endif
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/radix.h
--- a/tools/blktap/radix.h      Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,45 +0,0 @@
-/*
- * Radix tree for mapping (up to) 63-bit virtual block IDs to
- * 63-bit global block IDs
- *
- * Pointers within the tree set aside the least significant bit to indicate
- * whther or not the target block is writable from this node.
- *
- * The block with ID 0 is assumed to be an empty block of all zeros
- */
-
-#ifndef __RADIX_H__
-#define __RADIX_H__
-
-/* I don't really like exposing these, but... */
-#define getid(x) (((x)>>1)&0x7fffffffffffffffLL)
-#define putid(x) ((x)<<1)
-#define writable(x) (((x)<<1)|1LL)
-#define iswritable(x) ((x)&1LL)
-#define ZERO 0LL
-#define ONE 1LL
-#define ONEMASK 0xffffffffffffffeLL
-
-#define RADIX_TREE_MAP_SHIFT 9
-#define RADIX_TREE_MAP_MASK 0x1ff
-#define RADIX_TREE_MAP_ENTRIES 512
-
-typedef u64 *radix_tree_node;
-
-
-/*
- * main api
- * with these functions, the LSB of root always indicates
- * whether or not the block is writable, including the return
- * values of update and snapshot
- */
-u64 lookup(int height, u64 root, u64 key);
-u64 update(int height, u64 root, u64 key, u64 val);
-u64 snapshot(u64 root);
-int collapse(int height, u64 proot, u64 croot);
-int isprivate(int height, u64 root, u64 key);
-
-
-void __rcache_init(void);
-
-#endif /* __RADIX_H__ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/requests-async.c
--- a/tools/blktap/requests-async.c     Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,762 +0,0 @@
-/* requests-async.c
- *
- * asynchronous request dispatcher for radix access in parallax.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <assert.h>
-#include <pthread.h>
-#include <err.h>
-#include <zlib.h> /* for crc32() */
-#include "requests-async.h"
-#include "vdi.h"
-#include "radix.h"
-
-/* Radix-tree slot indices. Each level consumes 9 bits of the (already  */
-/* doubled, see vdi_read/vdi_write) vaddr: bits 18-26 select the L1     */
-/* slot, bits 9-17 the L2 slot and bits 0-8 the L3 slot.                */
-#define L1_IDX(_a) (((_a) & 0x0000000007fc0000ULL) >> 18)
-#define L2_IDX(_a) (((_a) & 0x000000000003fe00ULL) >> 9)
-#define L3_IDX(_a) (((_a) & 0x00000000000001ffULL))
-
-
-/* Debug tracing: change the #if to 1 to route DPRINTF to printf;       */
-/* otherwise every DPRINTF call compiles to a no-op.                    */
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-/* Per-block metadata stored in the L3 slot that follows a data pointer. */
-struct block_info {
-    u32        crc;    /* crc32 of the data block (computed in vdi_write) */
-    u32        unused; /* write paths store a debugging tag here */
-};
-
-/* State carried across the callbacks of one asynchronous read or write. */
-struct io_req {
-    enum { IO_OP_READ, IO_OP_WRITE } op;
-    u64        root;   /* radix root entry copied from the vdi */
-    u64        vaddr;  /* virtual block address, already shifted left by 1 */
-    int        state;  /* current value of enum states */
-    io_cb_t    cb;     /* caller's completion callback */
-    void      *param;  /* opaque argument handed back to cb */
-    struct radix_lock *lock;
-
-    /* internal stuff: */
-    struct io_ret     retval;/* holds the return while we unlock. */
-    char             *block; /* the block to write */
-    radix_tree_node   radix[3];      /* nodes held by a write, indexed L1..L3 */
-    u64               radix_addr[3]; /* block addresses of those nodes */
-    struct block_info bi;            /* crc record read from / written to L3 */
-};
-
-/* Mask every entry of a radix node with ONEMASK, which drops the
- * writable bit of each slot. */
-void clear_w_bits(radix_tree_node node)
-{
-    radix_tree_node slot = node;
-    radix_tree_node end  = node + RADIX_TREE_MAP_ENTRIES;
-
-    while (slot < end) {
-        *slot &= ONEMASK;
-        slot++;
-    }
-}
-
-/* Like clear_w_bits(), but only touches the even slots: in a bottom-level
- * node every second slot holds a struct block_info rather than a pointer. */
-void clear_L3_w_bits(radix_tree_node node)
-{
-    int slot = 0;
-
-    while (slot < RADIX_TREE_MAP_ENTRIES) {
-        node[slot] &= ONEMASK;
-        slot += 2;
-    }
-}
-
-/*
- * States of the read_cb/write_cb state machines, stored in io_req.state.
- * Each value names the operation whose completion the next callback
- * invocation will handle. Suffixes: "z" = the slot at that level was
- * ZERO (empty subtree), "f" = the slot was not writable (fault path).
- * WRITE_L2_L3z and WRITE_L2_L3f are declared but not used by write_cb.
- */
-enum states {
-    /* both */
-    READ_L1,
-    READ_L2,
-    READ_L3,
-
-    /* read */
-    READ_LOCKED,
-    READ_DATA,
-    READ_UNLOCKED,
-    RETURN_ZERO,
-
-    /* write */
-    WRITE_LOCKED,
-    WRITE_DATA,
-    WRITE_L3,
-    WRITE_UNLOCKED,
-    
-    /* L3 Zero Path */
-    ALLOC_DATA_L3z,
-    WRITE_L3_L3z,
-    
-    /* L3 Fault Path */
-    ALLOC_DATA_L3f,
-    WRITE_L3_L3f,
-    
-    /* L2 Zero Path */
-    ALLOC_DATA_L2z,
-    WRITE_L2_L2z,
-    ALLOC_L3_L2z,
-    WRITE_L2_L3z,
-    
-    /* L2 Fault Path */
-    READ_L3_L2f,
-    ALLOC_DATA_L2f,
-    WRITE_L2_L2f,
-    ALLOC_L3_L2f,
-    WRITE_L2_L3f,
-
-    /* L1 Zero Path */
-    ALLOC_DATA_L1z,
-    ALLOC_L3_L1z,
-    ALLOC_L2_L1z,
-    WRITE_L1_L1z,
-
-    /* L1 Fault Path */
-    READ_L2_L1f,
-    READ_L3_L1f,
-    ALLOC_DATA_L1f,
-    ALLOC_L3_L1f,
-    ALLOC_L2_L1f,
-    WRITE_L1_L1f,
-    
-};
-
-/* Indices into io_req.radix[] and io_req.radix_addr[], one per tree level. */
-enum radix_offsets {
-    L1 = 0, 
-    L2 = 1,
-    L3 = 2
-};
-
-
-static void read_cb(struct io_ret ret, void *param);
-static void write_cb(struct io_ret ret, void *param);
-
-/*
- * Start an asynchronous read of virtual block vaddr from vdi.
- *
- * Returns ERR_BAD_VADDR or ERR_NOMEM if the request cannot be queued,
- * otherwise 0; the result is then delivered to cb(result, param) by
- * read_cb once the read lock has been taken and the tree walked.
- */
-int vdi_read(vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param)
-{
-    struct io_req *rreq;
-    int level;
-
-    if (!VALID_VADDR(vaddr))
-        return ERR_BAD_VADDR;
-
-    /* Every second slot in the bottom-level radix node stores crc32    */
-    /* values etc., so the vaddr is doubled to land on the even slots.  */
-    vaddr <<= 1;
-
-    rreq = (struct io_req *)malloc(sizeof (struct io_req));
-    if (rreq == NULL)
-        return ERR_NOMEM;
-
-    for (level = L1; level <= L3; level++)
-        rreq->radix[level] = NULL;
-
-    rreq->op    = IO_OP_READ;
-    rreq->state = READ_LOCKED;
-    rreq->vaddr = vaddr;
-    rreq->root  = vdi->radix_root;
-    rreq->lock  = vdi->radix_lock;
-    rreq->cb    = cb;
-    rreq->param = param;
-
-    block_rlock(rreq->lock, L1_IDX(vaddr), read_cb, rreq);
-
-    return 0;
-}
-
-
-/*
- * Start an asynchronous write of block to virtual block vaddr of vdi.
- *
- * Returns ERR_BAD_VADDR or ERR_NOMEM if the request cannot be queued,
- * otherwise 0; the outcome is then delivered to cb(result, param) by
- * write_cb once the write lock has been taken and the tree updated.
- */
-int   vdi_write(vdi_t *vdi, u64 vaddr, char *block,
-                io_cb_t cb, void *param)
-{
-    struct io_req *wreq;
-    u32 checksum;
-    int level;
-
-    if (!VALID_VADDR(vaddr))
-        return ERR_BAD_VADDR;
-
-    /* Every second slot in the bottom-level radix node stores crc32    */
-    /* values etc., so the vaddr is doubled to land on the even slots.  */
-    vaddr <<= 1;
-
-    wreq = (struct io_req *)malloc(sizeof (struct io_req));
-    if (wreq == NULL)
-        return ERR_NOMEM;
-
-    for (level = L1; level <= L3; level++)
-        wreq->radix[level] = NULL;
-
-    /* Todo: add a pseudoheader to the block to include some location   */
-    /* information in the CRC as well.                                  */
-    checksum = (u32) crc32(0L, Z_NULL, 0);
-    checksum = (u32) crc32(checksum, block, BLOCK_SIZE);
-
-    wreq->op        = IO_OP_WRITE;
-    wreq->state     = WRITE_LOCKED;
-    wreq->vaddr     = vaddr;
-    wreq->block     = block;
-    wreq->root      = vdi->radix_root;
-    wreq->lock      = vdi->radix_lock;
-    wreq->bi.crc    = checksum;
-    wreq->bi.unused = 0xdeadbeef;
-    wreq->cb        = cb;
-    wreq->param     = param;
-    wreq->radix_addr[L1] = getid(wreq->root); /* for consistency */
-
-    block_wlock(wreq->lock, L1_IDX(vaddr), write_cb, wreq);
-
-    return 0;
-}
-
-/*
- * read_cb: completion callback that drives an asynchronous vdi_read().
- *
- * param is the struct io_req created by vdi_read(); req->state names the
- * operation that has just completed and ret carries its result. The walk
- * is: take the read lock, read the L1, L2 and L3 nodes, read the data
- * block, release the lock, then invoke the caller's callback. A ZERO
- * slot at any level short-circuits to RETURN_ZERO, which hands the
- * caller a block obtained from newblock().
- *
- * NOTE(review): the fail path frees the request and calls the callback
- * without a block_runlock(), so it looks like the read lock taken in
- * vdi_read() stays held after a failed read -- confirm.
- */
-static void read_cb(struct io_ret ret, void *param)
-{
-    struct io_req *req = (struct io_req *)param;
-    radix_tree_node node;
-    u64 idx;
-    char *block;
-    void *req_param;
-
-    DPRINTF("read_cb\n");
-    /* get record */
-    switch(req->state) {
-
-    case READ_LOCKED:
-
-        DPRINTF("READ_LOCKED\n");
-        req->state = READ_L1;
-        block_read(getid(req->root), read_cb, req);
-        break;
-
-    case READ_L1: /* block is the radix root */
-
-        DPRINTF("READ_L1\n");
-        block = IO_BLOCK(ret);
-        if (block == NULL) goto fail;
-        node = (radix_tree_node) block;
-        idx  = getid( node[L1_IDX(req->vaddr)] );
-        free(block);
-        if ( idx == ZERO ) {
-            req->state = RETURN_ZERO;
-            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-        } else {
-            req->state = READ_L2;
-            block_read(idx, read_cb, req);
-        }
-        break;
-
-    case READ_L2:
-
-        DPRINTF("READ_L2\n");
-        block = IO_BLOCK(ret);
-        if (block == NULL) goto fail;
-        node = (radix_tree_node) block;
-        idx  = getid( node[L2_IDX(req->vaddr)] );
-        free(block);
-        if ( idx == ZERO ) {
-            req->state = RETURN_ZERO;
-            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-        } else {
-            req->state = READ_L3;
-            block_read(idx, read_cb, req);
-        }
-        break;
-
-    case READ_L3:
-    {
-        struct block_info *bi;
-
-        DPRINTF("READ_L3\n");
-        block = IO_BLOCK(ret);
-        if (block == NULL) goto fail;
-        node = (radix_tree_node) block;
-        idx  = getid( node[L3_IDX(req->vaddr)] );
-        /* The slot after the data pointer holds the block's crc record. */
-        bi = (struct block_info *) &node[L3_IDX(req->vaddr) + 1];
-        req->bi = *bi;
-        free(block);
-        if ( idx == ZERO )  {
-            req->state = RETURN_ZERO;
-            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-        } else {
-            req->state = READ_DATA;
-            block_read(idx, read_cb, req);
-        }
-        break;
-    }
-    case READ_DATA:
-    {
-        u32 crc;
-
-        DPRINTF("READ_DATA\n");
-        block = IO_BLOCK(ret);
-        if (block == NULL) goto fail;
-
-        /* crc check */
-        crc = (u32) crc32(0L, Z_NULL, 0);
-        crc = (u32) crc32(crc, block, BLOCK_SIZE);
-        if (crc != req->bi.crc) {
-            /* TODO: add a retry loop here.                          */
-            /* Do this after the cache is added -- make sure to      */
-            /* invalidate the bad page before reissuing the read.    */
-
-            /* warnx, not warn: a CRC mismatch has no meaningful errno, */
-            /* and the err.h functions append the newline themselves.   */
-            warnx("Bad CRC on vaddr (%llu:%u)",
-                  (unsigned long long)req->vaddr,
-                  (unsigned int)req->bi.unused);
-#ifdef PRINT_BADCRC_PAGES
-            {
-                int j;
-                for (j=0; j<BLOCK_SIZE; j++) {
-                    if (isprint((unsigned char)block[j])) {
-                        printf("%c", block[j]);
-                    } else {
-                        printf(".");
-                    }
-                    /* break after every 64th character */
-                    if ((j % 64) == 63) printf("\n");
-                }
-            }
-#endif /* PRINT_BADCRC_PAGES */
-
-            /* fast and loose for the moment. */
-            /* goto fail;                     */
-        }
-
-        req->retval = ret;
-        req->state = READ_UNLOCKED;
-        block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-        break;
-    }
-    case READ_UNLOCKED:
-    {
-        struct io_ret r;
-        io_cb_t cb;
-        DPRINTF("READ_UNLOCKED\n");
-        /* Copy what we need out of req before freeing it. */
-        req_param = req->param;
-        r         = req->retval;
-        cb        = req->cb;
-        free(req);
-        cb(r, req_param);
-        break;
-    }
-
-    case RETURN_ZERO:
-    {
-        struct io_ret r;
-        io_cb_t cb;
-        DPRINTF("RETURN_ZERO\n");
-        req_param = req->param;
-        cb        = req->cb;
-        free(req);
-        r.type = IO_BLOCK_T;
-        r.u.b = newblock();
-        cb(r, req_param);
-        break;
-    }
-
-    default:
-        DPRINTF("*** Read: Bad state! (%d) ***\n", req->state);
-        goto fail;
-    }
-
-    return;
-
- fail:
-    {
-        struct io_ret r;
-        io_cb_t cb;
-        DPRINTF("asyn_read had a read error.\n");
-        req_param = req->param;
-        r         = ret;
-        cb        = req->cb;
-        free(req);
-        cb(r, req_param);
-    }
-
-
-}
-
-/*
- * write_cb: completion callback that drives an asynchronous vdi_write().
- *
- * param is the struct io_req created by vdi_write(); req->state names
- * the operation that has just completed and r carries its result. After
- * the write lock is taken the L1, L2 and L3 nodes are read in turn. At
- * each level the slot is either writable (continue down), ZERO (the
- * "zero" path: allocate the missing lower levels) or read-only (the
- * "fault" path: copy the lower levels with their writable bits cleared
- * and allocate fresh blocks for them). Every path ends by writing the
- * highest modified node, releasing the lock and calling the caller's
- * callback. Nodes held in req->radix[] are freed on completion and on
- * failure.
- *
- * NOTE(review): the fail path does not call block_wunlock(), so it looks
- * like the write lock stays held after a failed write -- confirm.
- * NOTE(review): results of block_alloc() (IO_ADDR(r)) are used without
- * an error check; confirm how allocation failure is reported.
- */
-static void write_cb(struct io_ret r, void *param)
-{
-    struct io_req *req = (struct io_req *)param;
-    radix_tree_node node;
-    u64 a, addr;
-    void *req_param;
-    struct block_info *bi;
-
-    switch(req->state) {
-
-    case WRITE_LOCKED:
-
-        DPRINTF("WRITE_LOCKED (%llu)\n", L1_IDX(req->vaddr));
-        req->state = READ_L1;
-        block_read(getid(req->root), write_cb, req);
-        break;
-
-    case READ_L1: /* block is the radix root */
-
-        DPRINTF("READ_L1\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        if (node == NULL) goto fail;
-        a    = node[L1_IDX(req->vaddr)];
-        addr = getid(a);
-
-        req->radix_addr[L2] = addr;
-        req->radix[L1] = node;
-
-        if ( addr == ZERO ) {
-            /* L1 empty subtree: */
-            req->state = ALLOC_DATA_L1z;
-            block_alloc( req->block, write_cb, req );
-        } else if ( !iswritable(a) ) {
-            /* L1 fault: */
-            req->state = READ_L2_L1f;
-            block_read( addr, write_cb, req );
-        } else {
-            req->state = READ_L2;
-            block_read( addr, write_cb, req );
-        }
-        break;
-
-    case READ_L2:
-
-        DPRINTF("READ_L2\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        if (node == NULL) goto fail;
-        a    = node[L2_IDX(req->vaddr)];
-        addr = getid(a);
-
-        req->radix_addr[L3] = addr;
-        req->radix[L2] = node;
-
-        if ( addr == ZERO ) {
-            /* L2 empty subtree: */
-            req->state = ALLOC_DATA_L2z;
-            block_alloc( req->block, write_cb, req );
-        } else if ( !iswritable(a) ) {
-            /* L2 fault: */
-            req->state = READ_L3_L2f;
-            block_read( addr, write_cb, req );
-        } else {
-            req->state = READ_L3;
-            block_read( addr, write_cb, req );
-        }
-        break;
-
-    case READ_L3:
-
-        DPRINTF("READ_L3\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        if (node == NULL) goto fail;
-        a    = node[L3_IDX(req->vaddr)];
-        addr = getid(a);
-
-        req->radix[L3] = node;
-
-        if ( addr == ZERO ) {
-            /* L3 zero: no data block yet */
-            req->state = ALLOC_DATA_L3z;
-            block_alloc( req->block, write_cb, req );
-        } else if ( !iswritable(a) ) {
-            /* L3 fault: */
-            req->state = ALLOC_DATA_L3f;
-            block_alloc( req->block, write_cb, req );
-        } else {
-            req->state = WRITE_DATA;
-            block_write( addr, req->block, write_cb, req );
-        }
-        break;
-
-    case WRITE_DATA:
-
-        DPRINTF("WRITE_DATA\n");
-        /* The L3 radix points to the correct block, we just need to  */
-        /* update the crc.                                            */
-        if (IO_INT(r) < 0) goto fail;
-        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 101;
-        *bi = req->bi;
-        req->state = WRITE_L3;
-        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
-        break;
-
-    /* L3 Zero Path: */
-
-    case ALLOC_DATA_L3z:
-
-        DPRINTF("ALLOC_DATA_L3z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L3][L3_IDX(req->vaddr)] = a;
-        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 102;
-        *bi = req->bi;
-        req->state = WRITE_L3_L3z;
-        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
-        break;
-
-    /* L3 Fault Path: */
-
-    case ALLOC_DATA_L3f:
-
-        DPRINTF("ALLOC_DATA_L3f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L3][L3_IDX(req->vaddr)] = a;
-        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 103;
-        *bi = req->bi;
-        req->state = WRITE_L3_L3f;
-        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
-        break;
-
-    /* L2 Zero Path: */
-
-    case ALLOC_DATA_L2z:
-
-        DPRINTF("ALLOC_DATA_L2z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L3] = newblock();
-        if (req->radix[L3] == NULL) goto fail;
-        req->radix[L3][L3_IDX(req->vaddr)] = a;
-        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 104;
-        *bi = req->bi;
-        req->state = ALLOC_L3_L2z;
-        block_alloc( (char*)req->radix[L3], write_cb, req );
-        break;
-
-    case ALLOC_L3_L2z:
-
-        DPRINTF("ALLOC_L3_L2z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L2][L2_IDX(req->vaddr)] = a;
-        req->state = WRITE_L2_L2z;
-        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
-        break;
-
-    /* L2 Fault Path: */
-
-    case READ_L3_L2f:
-
-        DPRINTF("READ_L3_L2f\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        /* Check before touching the node: a failed read yields NULL. */
-        if (node == NULL) goto fail;
-        clear_L3_w_bits(node);
-
-        req->radix[L3] = node;
-        req->state = ALLOC_DATA_L2f;
-        block_alloc( req->block, write_cb, req );
-        break;
-
-    case ALLOC_DATA_L2f:
-
-        DPRINTF("ALLOC_DATA_L2f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L3][L3_IDX(req->vaddr)] = a;
-        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 105;
-        *bi = req->bi;
-        req->state = ALLOC_L3_L2f;
-        block_alloc( (char*)req->radix[L3], write_cb, req );
-        break;
-
-    case ALLOC_L3_L2f:
-
-        DPRINTF("ALLOC_L3_L2f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L2][L2_IDX(req->vaddr)] = a;
-        req->state = WRITE_L2_L2f;
-        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
-        break;
-
-    /* L1 Zero Path: */
-
-    case ALLOC_DATA_L1z:
-
-        DPRINTF("ALLOC_DATA_L1z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L3] = newblock();
-        if (req->radix[L3] == NULL) goto fail;
-        req->radix[L3][L3_IDX(req->vaddr)] = a;
-        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 106;
-        *bi = req->bi;
-        req->state = ALLOC_L3_L1z;
-        block_alloc( (char*)req->radix[L3], write_cb, req );
-        break;
-
-    case ALLOC_L3_L1z:
-
-        DPRINTF("ALLOC_L3_L1z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L2] = newblock();
-        if (req->radix[L2] == NULL) goto fail;
-        req->radix[L2][L2_IDX(req->vaddr)] = a;
-        req->state = ALLOC_L2_L1z;
-        block_alloc( (char*)req->radix[L2], write_cb, req );
-        break;
-
-    case ALLOC_L2_L1z:
-
-        DPRINTF("ALLOC_L2_L1z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L1][L1_IDX(req->vaddr)] = a;
-        req->state = WRITE_L1_L1z;
-        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
-        break;
-
-    /* L1 Fault Path: */
-
-    case READ_L2_L1f:
-
-        DPRINTF("READ_L2_L1f\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        /* Check before touching the node: a failed read yields NULL. */
-        if (node == NULL) goto fail;
-        clear_w_bits(node);
-        a    = node[L2_IDX(req->vaddr)];
-        addr = getid(a);
-
-        req->radix_addr[L3] = addr;
-        req->radix[L2] = node;
-
-        if (addr == ZERO) {
-            /* nothing below L2, create an empty L3 and alloc data. */
-            /* (So skip READ_L3_L1f.) */
-            req->radix[L3] = newblock();
-            if (req->radix[L3] == NULL) goto fail;
-            req->state = ALLOC_DATA_L1f;
-            block_alloc( req->block, write_cb, req );
-        } else {
-            req->state = READ_L3_L1f;
-            block_read( addr, write_cb, req );
-        }
-        break;
-
-    case READ_L3_L1f:
-
-        DPRINTF("READ_L3_L1f\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        /* Check before touching the node: a failed read yields NULL. */
-        if (node == NULL) goto fail;
-        clear_L3_w_bits(node);
-
-        req->radix[L3] = node;
-        req->state = ALLOC_DATA_L1f;
-        block_alloc( req->block, write_cb, req );
-        break;
-
-    case ALLOC_DATA_L1f:
-
-        DPRINTF("ALLOC_DATA_L1f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L3][L3_IDX(req->vaddr)] = a;
-        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 107;
-        *bi = req->bi;
-        req->state = ALLOC_L3_L1f;
-        block_alloc( (char*)req->radix[L3], write_cb, req );
-        break;
-
-    case ALLOC_L3_L1f:
-
-        DPRINTF("ALLOC_L3_L1f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L2][L2_IDX(req->vaddr)] = a;
-        req->state = ALLOC_L2_L1f;
-        block_alloc( (char*)req->radix[L2], write_cb, req );
-        break;
-
-    case ALLOC_L2_L1f:
-
-        DPRINTF("ALLOC_L2_L1f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L1][L1_IDX(req->vaddr)] = a;
-        req->state = WRITE_L1_L1f;
-        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
-        break;
-
-    case WRITE_L3:
-    case WRITE_L3_L3z:
-    case WRITE_L3_L3f:
-    case WRITE_L2_L2z:
-    case WRITE_L2_L2f:
-    case WRITE_L1_L1z:
-    case WRITE_L1_L1f:
-    {
-        int i;
-        DPRINTF("DONE\n");
-        /* free any saved node vals. */
-        for (i=0; i<3; i++)
-            if (req->radix[i] != 0) free(req->radix[i]);
-        req->retval = r;
-        req->state = WRITE_UNLOCKED;
-        block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
-        break;
-    }
-    case WRITE_UNLOCKED:
-    {
-        struct io_ret result;
-        io_cb_t cb;
-        DPRINTF("WRITE_UNLOCKED!\n");
-        /* Copy what we need out of req before freeing it. */
-        req_param = req->param;
-        result    = req->retval;
-        cb        = req->cb;
-        free(req);
-        cb(result, req_param);
-        break;
-    }
-
-    default:
-        DPRINTF("*** Write: Bad state! (%d) ***\n", req->state);
-        goto fail;
-    }
-
-    return;
-
- fail:
-    {
-        struct io_ret result;
-        io_cb_t cb;
-        int i;
-
-        DPRINTF("asyn_write had a read error mid-way.\n");
-        req_param = req->param;
-        cb        = req->cb;
-        result.type = IO_INT_T;
-        result.u.i  = -1;
-        /* free any saved node vals. */
-        for (i=0; i<3; i++)
-            if (req->radix[i] != 0) free(req->radix[i]);
-        free(req);
-        cb(result, req_param);
-    }
-}
-
-/*
- * Synchronous wrapper around vdi_read(): returns the block that was
- * read, or NULL if the request could not be queued or the read failed.
- *
- * The local mutex is used as a one-shot completion signal: it is locked
- * before the request is issued, the callback unlocks it, and the second
- * lock below blocks until that has happened. reads_cb is a GCC nested
- * function so that it can store into the local variable block.
- * NOTE(review): unlocking a default mutex from a thread other than the
- * one that locked it is undefined per POSIX -- confirm which thread
- * runs the callback.
- */
-char *vdi_read_s(vdi_t *vdi, u64 vaddr)
-{
-    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
-    char *block = NULL;
-    int ret;
-
-    void reads_cb(struct io_ret r, void *param) 
-    {
-        block = IO_BLOCK(r);
-        pthread_mutex_unlock((pthread_mutex_t *)param);
-    }
-
-    pthread_mutex_lock(&m);
-    ret = vdi_read(vdi, vaddr, reads_cb, &m);
-
-    /* Wait for the callback only if the request was actually queued. */
-    if (ret == 0) pthread_mutex_lock(&m);
-    
-    return block;
-}
-
-
-/*
- * Synchronous wrapper around vdi_write(): returns the integer result
- * reported to the completion callback, or the (negative) error code of
- * vdi_write() itself when the request could not be queued.
- *
- * The local mutex is used as a one-shot completion signal: it is locked
- * before the request is issued, the callback unlocks it, and the second
- * lock blocks until that has happened. writes_cb is a GCC nested
- * function so that it can store into the local variable result.
- * NOTE(review): unlocking a default mutex from a thread other than the
- * one that locked it is undefined per POSIX -- confirm which thread
- * runs the callback.
- */
-int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block)
-{
-    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
-    int ret, result = -1;
-
-    void writes_cb(struct io_ret r, void *param)
-    {
-        result = IO_INT(r);
-        pthread_mutex_unlock((pthread_mutex_t *)param);
-    }
-
-    pthread_mutex_lock(&m);
-    ret = vdi_write(vdi, vaddr, block, writes_cb, &m);
-
-    /* The callback never runs if queueing failed: report that error   */
-    /* instead of returning an uninitialised result.                   */
-    if (ret != 0)
-        return ret;
-
-    pthread_mutex_lock(&m);
-
-    return result;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/requests-async.h
--- a/tools/blktap/requests-async.h     Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,29 +0,0 @@
-#ifndef _REQUESTSASYNC_H_
-#define _REQUESTSASYNC_H_
-
-#include "block-async.h"
-#include "blockstore.h" /* for newblock etc. */
-
-/*
-#define BLOCK_SIZE 4096
-#define ZERO 0ULL
-#define getid(x) (((x)>>1)&0x7fffffffffffffffLLU)
-#define iswritable(x) (((x) & 1LLU) != 0)
-#define writable(x) (((x) << 1) | 1LLU)
-#define readonly(x) ((u64)((x) << 1))
-*/
-
-/* A vaddr is valid when it fits entirely within VADDR_MASK. */
-#define VADDR_MASK 0x0000000003ffffffLLU /* 26-bits = 256Gig */
-#define VALID_VADDR(x) (((x) & VADDR_MASK) == (x))
-
-/* Asynchronous interface: a return of 0 means the request was queued   */
-/* and cb(result, param) will deliver the outcome; otherwise one of the */
-/* ERR_* codes below is returned.                                       */
-int vdi_read (vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param);
-int vdi_write(vdi_t *vdi, u64 vaddr, char *block, io_cb_t cb, void *param);
-
-/* synchronous versions: */
-char *vdi_read_s (vdi_t *vdi, u64 vaddr);
-int   vdi_write_s(vdi_t *vdi, u64 vaddr, char *block);
-
-/* Parenthesised so the values stay intact inside larger expressions. */
-#define ERR_BAD_VADDR  (-1)
-#define ERR_NOMEM      (-2)
-
-#endif /* _REQUESTSASYNC_H_ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/snaplog.c
--- a/tools/blktap/snaplog.c    Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,238 +0,0 @@
-/**************************************************************************
- * 
- * snaplog.c
- *
- * Snapshot log on-disk data structure.
- *
- */
- 
- /* VDI histories are made from chains of snapshot logs.  These logs record 
-  * the (radix) root and timestamp of individual snapshots.
-  *
-  * creation of a new VDI involves 'forking' a snapshot log, by creating a 
-  * new, empty log (in a new VDI) and parenting it off of a record in an 
-  * existing snapshot log.
-  *
-  * snapshot log blocks have at most one writer.
-  */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "snaplog.h"
-
-
-
-/*
- * Read a block from the block store and return it only if its header
- * carries SNAP_MAGIC; otherwise the block is released and NULL returned.
- * The caller releases a non-NULL result with freeblock().
- */
-snap_block_t *snap_get_block(u64 block)
-{
-    snap_block_t *candidate = (snap_block_t *)readblock(block);
-
-    if ( (candidate != NULL) && (candidate->hdr.magic != SNAP_MAGIC) ) {
-        freeblock(candidate);
-        candidate = NULL;
-    }
-
-    return candidate;
-}
-    
-/*
- * Copy the snapshot record named by id into *target.
- *
- * Returns 0 on success, or -1 if id is NULL, the block is not a valid
- * snap block, or the index is out of range. Indices at or beyond
- * SNAPS_PER_BLOCK are rejected as well so that snaps[] is never read
- * past its end.
- */
-int snap_get_id(snap_id_t *id, snap_rec_t *target)
-{
-    snap_block_t *blk;
-
-    if ( id == NULL )
-        return -1;
-
-    blk = snap_get_block(id->block);
-
-    if ( blk == NULL )
-        return -1;
-
-    if (   (id->index > blk->hdr.nr_entries)
-        || (id->index >= SNAPS_PER_BLOCK) ) {
-        freeblock(blk);
-        return -1;
-    }
-
-    *target = blk->snaps[id->index];
-    freeblock(blk);
-    return 0;
-}
-
-/*
- * Allocate a new, empty snap block and store its id in *new_id.
- *
- * parent_id / fork_id are recorded in the new header (both become
- * null_snap_id when parent_id is NULL). When the parent lives in a
- * different, non-zero block than the fork point, the running
- * log_entries count is inherited from the parent block.
- *
- * Returns 0 on success, -1 on failure.
- */
-int __snap_block_create(snap_id_t *parent_id, snap_id_t *fork_id,
-                                  snap_id_t *new_id)
-{
-    snap_block_t *blk, *pblk;
-
-    /* A parent is always recorded together with its fork point. */
-    if ( (parent_id != NULL) && (fork_id == NULL) )
-        return -1;
-
-    /* TODO: validate parent_id and fork_id with snap_get_id(); that    */
-    /* check was disabled in the original code.                         */
-
-    blk = (snap_block_t *)newblock();
-    if ( blk == NULL )
-        return -1;
-
-    blk->hdr.magic  = SNAP_MAGIC;
-    blk->hdr.nr_entries  = 0;
-    blk->hdr.log_entries = 0;
-    blk->hdr.immutable   = 0;
-
-    if (   (parent_id  != NULL)
-        && (parent_id->block != fork_id->block)
-        && (parent_id->block != 0)) {
-
-        pblk = snap_get_block(parent_id->block);
-        if ( pblk == NULL ) {
-            freeblock(blk);
-            return -1;
-        }
-        blk->hdr.log_entries = pblk->hdr.log_entries;
-        freeblock(pblk);
-    }
-
-    if (parent_id != NULL) {
-        blk->hdr.parent_block = *parent_id;
-        blk->hdr.fork_block   = *fork_id;
-    } else {
-        blk->hdr.parent_block = null_snap_id;
-        blk->hdr.fork_block   = null_snap_id;
-    }
-
-    new_id->index = 0;
-    new_id->block = allocblock(blk);
-    freeblock(blk);
-    if (new_id->block == 0)
-        return -1;
-
-    return 0;
-}
-
-/* Create a new snap block whose fork point is its own parent record. */
-int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id)
-{
-    snap_id_t *fork_id = parent_id;
-
-    return __snap_block_create(parent_id, fork_id, new_id);
-}
-
-/*
- * Append snapshot record *rec to the log block named by *old_id and
- * store the id of the appended position in *new_id.
- *
- * When the block is already full, a new block is chained on (parented
- * at the last record of the old one), the old block is marked immutable
- * and the record goes into the new block.
- *
- * Returns 0 on success, -1 if rec is marked deleted, the block cannot
- * be read, the block is immutable, or a new block cannot be created.
- */
-int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id)
-{
-    snap_id_t id = *old_id;
-    snap_block_t *blk;
-
-    if ( rec->deleted == 1 ) {
-        printf("Attempt to append a deleted snapshot!\n");
-        return -1;
-    }
-
-    blk = snap_get_block(id.block);
-    if ( blk == NULL )
-        return -1;
-
-    if ( blk->hdr.immutable != 0 ) {
-        printf("Attempt to snap an immutable snap block!\n");
-        freeblock(blk);
-        return -1;
-    }
-
-    new_id->block = id.block;
-
-    if (blk->hdr.nr_entries == SNAPS_PER_BLOCK) {
-        int ret;
-
-        id.index--; /* make id point to the last full record */
-
-        ret = __snap_block_create(&id, &blk->hdr.fork_block, new_id);
-        if ( ret != 0 ) {
-            freeblock(blk);
-            return -1;
-        }
-
-        blk->hdr.immutable = 1;
-        writeblock(id.block, blk);
-        freeblock(blk);
-        blk = snap_get_block(new_id->block);
-        if ( blk == NULL )
-            return -1;
-        id = *new_id;
-    }
-
-    blk->snaps[blk->hdr.nr_entries] = *rec;
-    blk->hdr.nr_entries++;
-    blk->hdr.log_entries++;
-    new_id->index = blk->hdr.nr_entries;
-    //printf("snap: %u %u\n", blk->hdr.nr_entries, blk->hdr.log_entries);
-    writeblock(id.block, blk);
-    freeblock(blk);
-    return 0;
-}
-
-/*
- * Collapse parent snapshot p_id into child snapshot c_id: the parent
- * record is marked deleted and written back, then collapse() is run on
- * the two radix roots.
- *
- * Only consecutive snapshots within the same log block are supported.
- * Returns the result of collapse(), or -1 if either block cannot be
- * read or the snapshots are not consecutive.
- */
-int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id)
-{
-    snap_block_t *p_blk, *c_blk;
-    snap_rec_t   *p_rec, *c_rec;
-    int ret = -1;
-
-    p_blk = snap_get_block(p_id->block);
-
-    if (p_blk == NULL) return(-1);
-
-    if (c_id->block == p_id->block)
-    {
-        c_blk = p_blk;
-    } else {
-         c_blk = snap_get_block(c_id->block);
-    }
-
-    /* The child block may have failed to load; release the parent. */
-    if (c_blk == NULL) {
-        freeblock(p_blk);
-        return(-1);
-    }
-
-    /* parent and child must not be deleted. */
-    p_rec = &p_blk->snaps[p_id->index];
-    c_rec = &c_blk->snaps[c_id->index];
-    /*
-    if ( (p_rec->deleted == 1) || (c_rec->deleted == 1) ) {
-        printf("One of those snaps is already deleted.\n");
-        goto done;
-    }
-    */
-    /* first non-deleted thing in the log before child must be parent. */
-
-    /* XXX todo: test the range here for delete (and eventually fork) bits */
-    /* for now, snaps must be consecutive, on the same log page: */
-
-    if ((p_id->block != c_id->block) || (p_id->index != c_id->index-1))
-    {
-        printf("Deleting non-consecutive snaps is not done yet.\n");
-        goto done;
-    }
-
-    /* mark parent as deleted XXX: may need to lock parent block here.*/
-    p_rec->deleted = 1;
-    writeblock(p_id->block, p_blk);
-
-    /* delete the parent */
-    printf("collapse(%Ld, %Ld)\n", p_rec->radix_root, c_rec->radix_root);
-    ret = collapse(height, p_rec->radix_root, c_rec->radix_root);
-
-    /* return the number of blocks reclaimed. */
-
-done:
-    if (c_blk != p_blk) freeblock(c_blk);
-    freeblock(p_blk);
-
-    return(ret);
-}
-
-/*
- * Print the snapshot history reachable from snap_id: the records of its
- * block from snap_id->index down to 0, then the same for each parent
- * block in turn, following hdr.parent_block until a block id of 0 or an
- * unreadable block ends the chain.
- */
-void snap_print_history(snap_id_t *snap_id)
-{
-    snap_id_t id = *snap_id;
-    snap_block_t *blk = snap_get_block(id.block);
-
-    while ( blk != NULL ) {
-        /* Stays NULL when there is no parent, which ends the loop. */
-        snap_block_t *parent_blk = NULL;
-        /* Restart from this block's own index; keep it inside snaps[]. */
-        unsigned int idx = id.index;
-
-        if (idx >= SNAPS_PER_BLOCK)
-            idx = SNAPS_PER_BLOCK - 1;
-
-        printf("[Snap block %Ld]:\n", id.block);
-        do {
-            printf("   %03u: root: %Ld ts: %ld.%ld\n", idx,
-                    blk->snaps[idx].radix_root,
-                    blk->snaps[idx].timestamp.tv_sec,
-                    blk->snaps[idx].timestamp.tv_usec);
-        } while (idx-- != 0);
-
-        id = blk->hdr.parent_block;
-        if (id.block != 0) {
-            parent_blk = snap_get_block(id.block);
-        }
-        freeblock(blk);
-        blk = parent_blk;
-    }
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/snaplog.h
--- a/tools/blktap/snaplog.h    Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,61 +0,0 @@
-/**************************************************************************
- * 
- * snaplog.h
- *
- * Snapshot log on-disk data structure.
- *
- */
- 
-#include "radix.h"
-#include "blockstore.h"    /* for BLOCK_SIZE */
- 
-#ifndef __SNAPLOG_H__
-#define __SNAPLOG_H__
-
-/* Identifies a position in a snapshot log: a log block plus a slot index. */
-typedef struct snap_id {
-    u64            block;   /* block id of the snap block */
-    unsigned int   index;   /* index into that block's snaps[] */
-} snap_id_t;
-
-/* One snapshot record: the radix root and timestamp of a snapshot. */
-typedef struct snap_rec {
-    u64            radix_root;
-    struct timeval timestamp;
-    /* flags: */
-    unsigned       deleted:1;   /* set by snap_collapse() on the parent */
-} snap_rec_t;
-
-
-int  snap_block_create(snap_id_t *parent_id, snap_id_t *new_id);
-int  snap_append(snap_id_t *id, snap_rec_t *rec, snap_id_t *new_id);
-int  snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id);
-void snap_print_history(snap_id_t *snap_id);
-int  snap_get_id(snap_id_t *id, snap_rec_t *target);
-
-
-/* exported for vdi debugging */
-#define SNAP_MAGIC 0xff00ff0aa0ff00ffLL
-
-static const snap_id_t null_snap_id = { 0, 0 }; 
-
-/* Header at the start of every snap block; magic must equal SNAP_MAGIC */
-/* for snap_get_block() to accept the block.                            */
-typedef struct snap_block_hdr {
-    u64            magic;
-    snap_id_t      parent_block; /* parent block within this chain */
-    snap_id_t      fork_block;   /* where this log was forked */
-    unsigned       log_entries;  /* total entries since forking */
-    unsigned short nr_entries;   /* entries in snaps[] */
-    unsigned short immutable;    /* has this snap page become immutable? */
-} snap_block_hdr_t;
-
-
-/* Number of snapshot records that fit in one block after the header. */
-#define SNAPS_PER_BLOCK \
-    ((BLOCK_SIZE - sizeof(snap_block_hdr_t)) / sizeof(snap_rec_t))
-
-/* On-disk layout of a snap block: the header followed by the records. */
-typedef struct snap_block {
-    snap_block_hdr_t hdr;
-    snap_rec_t       snaps[SNAPS_PER_BLOCK];
-} snap_block_t;
-    
-
-snap_block_t *snap_get_block(u64 block);
-
-#endif /* __SNAPLOG_H__ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi.c
--- a/tools/blktap/vdi.c        Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,367 +0,0 @@
-/**************************************************************************
- * 
- * vdi.c
- *
- * Virtual Disk Image (VDI) Interfaces
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/time.h>
-#include <pthread.h>
-#include "blockstore.h"
-#include "block-async.h"
-#include "requests-async.h"
-#include "radix.h"
-#include "vdi.h"
-                    
-#define VDI_REG_BLOCK   2LL
-#define VDI_RADIX_ROOT  writable(3)
-                                                            
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-/* I haven't decided about this registry stuff, so this is just a really
- * quick lash-up so that there is some way to track VDIs.
- *
- * (Most vdi access should be with a direct handle to the block, so this
- *  registry is just for start-of-day lookup and other control operations.)
- */
-
-vdi_registry_t *create_vdi_registry(void)
-{
-    vdi_registry_t *reg = (vdi_registry_t *)newblock();
-    
-    if (reg == NULL)
-        return NULL;
-    
-    /* zero-fill the vdi radix root while we have an empty block. */
-    writeblock(VDI_RADIX_ROOT, (void *)reg);
-    
-    
-    DPRINTF("[vdi.c] Creating VDI registry!\n");
-    reg->magic      = VDI_REG_MAGIC;
-    reg->nr_vdis    = 0;
-    
-    writeblock(VDI_REG_BLOCK, (void *)reg);
-    
-    return reg;
-}
-    
-vdi_registry_t *get_vdi_registry(void)
-{
-    vdi_registry_t *vdi_reg = (vdi_registry_t *)readblock(VDI_REG_BLOCK);
-    
-    if ( vdi_reg == NULL )
-        vdi_reg = create_vdi_registry();
-    
-    if ( vdi_reg->magic != VDI_REG_MAGIC ) {
-        freeblock(vdi_reg);
-        return NULL;
-    }
-    
-    return vdi_reg;
-}
-
-
-vdi_t *vdi_create(snap_id_t *parent_snap, char *name)
-{
-    int ret;
-    vdi_t *vdi;
-    vdi_registry_t *vdi_reg;
-    snap_rec_t snap_rec;
-    
-    /* create a vdi struct */
-    vdi = newblock();
-    if (vdi == NULL) 
-        return NULL;
-    
-    if ( snap_get_id(parent_snap, &snap_rec) == 0 ) {
-        vdi->radix_root = snapshot(snap_rec.radix_root);
-    } else {
-        vdi->radix_root = allocblock((void *)vdi); /* vdi is just zeros here */
-        vdi->radix_root = writable(vdi->radix_root); /* grr. */
-    }
-    
-    /* create a snapshot log, and add it to the vdi struct */
-    
-    ret = snap_block_create(parent_snap, &vdi->snap);
-    if ( ret != 0 ) {
-        DPRINTF("Error getting snap block in vdi_create.\n");
-        freeblock(vdi);
-        return NULL;
-    }
-            
-    /* append the vdi to the registry, fill block and id.             */
-    /* implicit allocation means we have to write the vdi twice here. */
-    vdi_reg    = get_vdi_registry();
-    if ( vdi_reg == NULL ) {
-        freeblock(vdi);
-        return NULL;
-    }
-    
-    vdi->block = allocblock((void *)vdi);
-    vdi->id    = vdi_reg->nr_vdis++;
-    strncpy(vdi->name, name, VDI_NAME_SZ);
-    vdi->name[VDI_NAME_SZ] = '\0';
-    vdi->radix_lock = NULL; /* for tidiness */
-    writeblock(vdi->block, (void *)vdi);
-    
-    update(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi->id, vdi->block);
-    writeblock(VDI_REG_BLOCK, (void *)vdi_reg);
-    freeblock(vdi_reg);
-    
-    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
-    if (vdi->radix_lock == NULL) 
-    {
-       perror("couldn't malloc radix_lock for new vdi!");
-       freeblock(vdi);
-       return NULL;
-    }
-    radix_lock_init(vdi->radix_lock);
-    
-    return vdi;
-}
-
-/* vdi_get and vdi_put currently act more like alloc/free -- they don't 
- * do refcount-based allocation.  
- */
-vdi_t *vdi_get(u64 vdi_id)
-{
-    u64 vdi_blk;
-    vdi_t *vdi;
-    
-    vdi_blk = lookup(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi_id);
-    
-    if ( vdi_blk == 0 )
-        return NULL;
-    
-    vdi = (vdi_t *)readblock(vdi_blk);
-    
-    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
-    if (vdi->radix_lock == NULL) 
-    {
-       perror("couldn't malloc radix_lock for new vdi!");
-       freeblock(vdi);
-       return NULL;
-    }
-    radix_lock_init(vdi->radix_lock);
-    
-    return vdi;
-}
-
-void vdi_put(vdi_t *vdi)
-{
-    free(vdi->radix_lock);
-    freeblock(vdi);
-}
-
-void vdi_snapshot(vdi_t *vdi)
-{
-    snap_rec_t rec;
-    int ret;
-    
-    rec.radix_root = vdi->radix_root;
-    gettimeofday(&rec.timestamp, NULL);
-    rec.deleted = 0;
-    
-    vdi->radix_root = snapshot(vdi->radix_root);
-    ret = snap_append(&vdi->snap, &rec, &vdi->snap);
-    if ( ret != 0 ) {
-        printf("snap_append returned failure\n");
-        return;
-    }
-    writeblock(vdi->block, vdi);
-}
-    
-int __init_vdi()
-{
-    /* sneak this in here for the moment. */
-    __rcache_init();
-    
-    /* force the registry to be created if it doesn't exist. */
-    vdi_registry_t *vdi_reg = get_vdi_registry();
-    if (vdi_reg == NULL) {
-        printf("[vdi.c] Couldn't get/create a VDI registry!\n");
-        return -1;
-    }
-    freeblock(vdi_reg);
-    
-    
-    return 0;
-}
-    
-#ifdef VDI_STANDALONE
-
-#define TEST_VDIS      50
-#define NR_ITERS    50000
-#define FORK_POINTS   200
-#define INIT_VDIS       3
-#define INIT_SNAPS     40
-
-/* These must be of decreasing size: */
-#define NEW_FORK       (RAND_MAX-(RAND_MAX/1000))
-#define NEW_ROOT_VDI   (RAND_MAX-((RAND_MAX/1000)*2))
-#define NEW_FORK_VDI   (RAND_MAX-((RAND_MAX/1000)*3))
-
-#define GRAPH_DOT_FILE "vdi.dot"
-#define GRAPH_PS_FILE  "vdi.ps"
-
-
-typedef struct sh_st {
-    snap_id_t     id;
-    struct sh_st *next;
-} sh_t;
-
-#define SNAP_HASHSZ 1024
-sh_t *node_hash[SNAP_HASHSZ];
-#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
-
-#define SNAPID_EQUAL(_a,_b) \
-    (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
-int sh_check_and_add(snap_id_t *id)
-{
-    sh_t **s = &node_hash[SNAP_HASH(id)];
-    
-    while (*s != NULL) {
-        if (SNAPID_EQUAL(&((*s)->id), id))
-            return 1;
-        *s = (*s)->next;
-    }
-    
-    *s = (sh_t *)malloc(sizeof(sh_t));
-    (*s)->id = *id;
-    (*s)->next = NULL;
-    
-    return 0;
-}
-
-int main(int argc, char *argv[])
-{
-    vdi_t *vdi_list[TEST_VDIS];
-    snap_id_t id, fork_points[FORK_POINTS];
-    int nr_vdis = 0, nr_forks = 0;
-    int i, j, r;
-    FILE *f;
-    char name[VDI_NAME_SZ];
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    printf("[o] Generating seed VDIs. (%d VDIs)\n", INIT_VDIS);
-    
-    for (i=0; i<INIT_VDIS; i++) {
-        r=rand();
-        
-        sprintf(name, "VDI Number %d", nr_vdis);
-        vdi_list[i] = vdi_create(NULL, name);
-        for (j=0; j<(r%INIT_SNAPS); j++)
-            vdi_snapshot(vdi_list[i]);
-        fork_points[i] = vdi_list[i]->snap;
-        nr_vdis++;
-        nr_forks++;
-    }
-    
-    printf("[o] Running a random workload. (%d iterations)\n", NR_ITERS);
-            
-    for (i=0; i<NR_ITERS; i++) {
-        r = rand();
-        
-        if ( r > NEW_FORK ) {
-            if ( nr_forks > FORK_POINTS )
-                continue;
-            id = vdi_list[r%nr_vdis]->snap;
-            if ( ( id.block == 0 ) || ( id.index == 0 ) )
-                continue;
-            id.index--;
-            fork_points[nr_forks++] = id;
-            
-        } else if ( r > NEW_ROOT_VDI ) {
-            
-            if ( nr_vdis == TEST_VDIS )
-                continue;
-            
-            sprintf(name, "VDI Number %d.", nr_vdis);
-            vdi_list[nr_vdis++] = vdi_create(NULL, name);
-            
-        } else if ( r > NEW_FORK_VDI ) {
-            
-            if ( nr_vdis == TEST_VDIS )
-                continue;
-            
-            sprintf(name, "VDI Number %d.", nr_vdis);
-            vdi_list[nr_vdis++] = vdi_create(&fork_points[r%nr_forks], name);
-            
-        } else /* SNAPSHOT */ {
-            
-            vdi_snapshot(vdi_list[r%nr_vdis]);
-            
-        }
-    }
-    
-    /* now dump it out to a dot file. */
-    printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis);
-    
-    f = fopen(GRAPH_DOT_FILE, "w");
-    
-    /* write graph preamble */
-    fprintf(f, "digraph G {\n");
-    fprintf(f, "   rankdir=LR\n");
-    
-    for (i=0; i<nr_vdis; i++) {
-        char oldnode[255];
-        snap_block_t *blk;
-        snap_id_t id = vdi_list[i]->snap;
-        int nr_snaps, done=0;
-        
-        /* add a node for the id */
-printf("vdi: %d\n", i);
-        fprintf(f, "   n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", 
-                id.block, id.index, vdi_list[i]->name,
-                id.block, id.index);
-        sprintf(oldnode, "n%Ld%d", id.block, id.index);
-        
-        while (id.block != 0) {
-            blk = snap_get_block(id.block);
-            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
-            id = blk->hdr.fork_block;
-            
-            done = sh_check_and_add(&id);
-            
-            /* add a node for the fork_id */
-            if (!done) {
-                fprintf(f, "   n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", 
-                    id.block, id.index,
-                    id.block, id.index);
-            }
-            
-            /* add an edge between them */
-            fprintf(f, "   n%Ld%d -> %s [label=\"%u snapshots\"]\n",
-                    id.block, id.index, oldnode, nr_snaps);
-            sprintf(oldnode, "n%Ld%d", id.block, id.index);
-            freeblock(blk);
-            
-            if (done) break;
-        }
-    }
-    
-    /* write graph postamble */
-    fprintf(f, "}\n");
-    fclose(f);
-    
-    printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE);
-    {
-        char cmd[255];
-        sprintf(cmd, "dot %s -Tps -o %s", GRAPH_DOT_FILE, GRAPH_PS_FILE);
-        system(cmd);
-    }
-    return 0;
-}
-
-#endif
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi.h
--- a/tools/blktap/vdi.h        Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,55 +0,0 @@
-#ifndef _VDI_H_
-#define _VDI_H_
-/**************************************************************************
- * 
- * vdi.h
- *
- * Virtual Disk Image (VDI) Interfaces
- *
- */
-
-#ifndef __VDI_H__
-#define __VDI_H__
-
-#include "blktaplib.h"
-#include "snaplog.h"
-
-#define VDI_HEIGHT     27 /* Note that these are now hard-coded */
-#define VDI_REG_HEIGHT 27 /* in the async lookup code           */
-
-#define VDI_NAME_SZ 256
-
-
-typedef struct vdi {
-    u64         id;               /* unique vdi id -- used by the registry   */
-    u64         block;            /* block where this vdi lives (also unique)*/
-    u64         radix_root;       /* radix root node for block mappings      */
-    snap_id_t   snap;             /* next snapshot slot for this VDI         */
-    struct vdi *next;             /* used to hash-chain in blkif.            */
-    blkif_vdev_t vdevice;         /* currently mounted as...                 */
-    struct radix_lock *radix_lock;/* per-line L1 RW lock for parallel reqs   */
-    char        name[VDI_NAME_SZ];/* human readable vdi name                 */
-} vdi_t;
-
-#define VDI_REG_MAGIC   0xff00ff0bb0ff00ffLL
-
-typedef struct vdi_registry {
-    u64     magic;
-    u64     nr_vdis;
-} vdi_registry_t;
-
-
-int __init_vdi(void);
-
-vdi_t *vdi_get(u64 vdi_id);
-void vdi_put(vdi_t *vdi);
-vdi_registry_t *get_vdi_registry(void);
-vdi_t *vdi_create(snap_id_t *parent_snap, char *name);
-u64 vdi_lookup_block(vdi_t *vdi, u64 vdi_block, int *writable);
-void vdi_update_block(vdi_t *vdi, u64 vdi_block, u64 g_block);
-void vdi_snapshot(vdi_t *vdi);
-
-
-#endif /* __VDI_H__ */
-
-#endif //_VDI_H_
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_create.c
--- a/tools/blktap/vdi_create.c Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,52 +0,0 @@
-/**************************************************************************
- * 
- * vdi_create.c
- *
- * Create a new vdi.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-int main(int argc, char *argv[])
-{
-    vdi_t       *vdi;
-    char         name[VDI_NAME_SZ] = "";
-    snap_id_t    id;
-    int          from_snap = 0;
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    if ( argc == 1 ) {
-        printf("usage: %s <VDI Name> [<snap block> <snap idx>]\n", argv[0]);
-        exit(-1);
-    }
-    
-    strncpy( name, argv[1], VDI_NAME_SZ);
-    name[VDI_NAME_SZ] = '\0';    
-    
-    if ( argc > 3 ) {
-        id.block   = (u64)          atoll(argv[2]);
-        id.index   = (unsigned int) atol (argv[3]);
-        from_snap  = 1;
-    }
-    
-    vdi = vdi_create( from_snap ? &id : NULL, name);
-    
-    if ( vdi == NULL ) {
-        printf("Failed to create VDI!\n");
-        freeblock(vdi);
-        exit(-1);
-    }
-    
-    freeblock(vdi);
-    
-    return (0);
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_fill.c
--- a/tools/blktap/vdi_fill.c   Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,81 +0,0 @@
-/**************************************************************************
- * 
- * vdi_fill.c
- *
- * Hoover a file or device into a vdi.
- * You must first create the vdi with vdi_create.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "requests-async.h"
-#include "vdi.h"
-
-int main(int argc, char *argv[])
-{
-    vdi_t       *vdi;
-    u64          id;
-    int          fd;
-    struct stat  st;
-    u64          tot_size;
-    char         spage[BLOCK_SIZE];
-    char        *dpage;
-    u64          vblock = 0, count=0;
-    
-    __init_blockstore();
-    init_block_async();
-    __init_vdi();
-    
-    if ( argc < 3 ) {
-        printf("usage: %s <VDI id> <filename>\n", argv[0]);
-        exit(-1);
-    }
-        
-    id = (u64) atoll(argv[1]);
-    
-    vdi = vdi_get( id );
-    
-    if ( vdi == NULL ) {
-        printf("Failed to retreive VDI %Ld!\n", id);
-        exit(-1);
-    }
-    
-    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
-    
-    if (fd < 0) {
-        printf("Couldn't open %s!\n", argv[2]);
-        exit(-1);
-    }
-    
-    if ( fstat(fd, &st) != 0 ) {
-        printf("Couldn't stat %s!\n", argv[2]);
-        exit(-1);
-    }
-    
-    tot_size = (u64) st.st_size;
-    printf("Filling VDI %Ld with %Ld bytes.\n", id, tot_size);
-    
-    printf("%011Ld blocks total\n", tot_size / BLOCK_SIZE);    
-    printf("           ");
-    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
-        vdi_write_s(vdi, vblock, spage);
-        
-        vblock++;
-        if ((vblock % 512) == 0)
-        printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
-        fflush(stdout);
-    }
-    printf("\n");
-    
-    freeblock(vdi);
-    
-    return (0);
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_list.c
--- a/tools/blktap/vdi_list.c   Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,47 +0,0 @@
-/**************************************************************************
- * 
- * vdi_list.c
- *
- * Print a list of VDIs on the block store.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-int main(int argc, char *argv[])
-{
-    vdi_registry_t *reg;
-    vdi_t *vdi;
-    int i;
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    reg = get_vdi_registry();
-    
-    if ( reg == NULL ) {
-        printf("couldn't get VDI registry.\n");
-        exit(-1);
-    }
-    
-    for (i=0; i < reg->nr_vdis; i++) {
-        vdi = vdi_get(i);
-        
-        if ( vdi != NULL ) {
-            
-            printf("%10Ld %60s\n", vdi->id, vdi->name);
-            freeblock(vdi);
-            
-        }
-    }
-    
-    freeblock(reg);
-    
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_snap.c
--- a/tools/blktap/vdi_snap.c   Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,43 +0,0 @@
-/**************************************************************************
- * 
- * vdi_snap.c
- *
- * Snapshot a vdi.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-int main(int argc, char *argv[])
-{
-    vdi_t  *vdi;
-    u64     id;
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    if ( argc == 1 ) {
-        printf("usage: %s <VDI id>\n", argv[0]);
-        exit(-1);
-    }
-    
-    id = (u64) atoll(argv[1]);
-    
-    vdi = vdi_get(id);
-    
-    if ( vdi == NULL ) {
-        printf("couldn't find the requested VDI.\n");
-        freeblock(vdi);
-        exit(-1);
-    }
-    
-    vdi_snapshot(vdi);
-    
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_snap_delete.c
--- a/tools/blktap/vdi_snap_delete.c    Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,48 +0,0 @@
-/**************************************************************************
- * 
- * vdi_snap_delete.c
- *
- * Delete a snapshot.
- *
- * This is not finished:  right now it takes a snap n and calls 
- * snap_collapse(n,n+1).
- *
- * TODO: support for non-consecutive, non-same-block snaps
- *       Avoid forking probs.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "snaplog.h"
-#include "radix.h"
-#include "vdi.h"
-
-int main(int argc, char *argv[])
-{
-    snap_id_t    id, c_id;
-    int ret;
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    if ( argc != 3 ) {
-        printf("usage: %s <snap block> <snap idx>\n", argv[0]);
-        exit(-1);
-    }
-    
-    id.block   = (u64)          atoll(argv[1]);
-    id.index   = (unsigned int) atol (argv[2]);
-    
-    c_id = id;
-    c_id.index++;
-    
-    ret = snap_collapse(VDI_HEIGHT, &id, &c_id);
-    
-    printf("Freed %d blocks.\n", ret);
-    
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_snap_list.c
--- a/tools/blktap/vdi_snap_list.c      Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,82 +0,0 @@
-/**************************************************************************
- * 
- * vdi_snap_list.c
- *
- * Print a list of snapshots for the specified vdi.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-int main(int argc, char *argv[])
-{
-    vdi_t        *vdi;
-    u64           id;
-    int           i, max_snaps = -1;
-    snap_block_t *blk;
-    snap_id_t     sid;
-    char         *t;
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    if ( argc == 1 ) {
-        printf("usage: %s <VDI id> [max snaps]\n", argv[0]);
-        exit(-1);
-    }
-    
-    id = (u64) atoll(argv[1]);
-    
-    if ( argc > 2 ) {
-        max_snaps = atoi(argv[2]);
-    }
-    
-    vdi = vdi_get(id);
-    
-    if ( vdi == NULL ) {
-        printf("couldn't find the requested VDI.\n");
-        freeblock(vdi);
-        exit(-1);
-    }
-    
-    sid = vdi->snap;
-    sid.index--;
-    
-    //printf("%8s%4s%21s %12s %1s\n", "Block", "idx", "timestamp", 
-    //    "radix root", "d");
-    printf("%8s%4s%37s %12s %1s\n", "Block", "idx", "timestamp", 
-            "radix root", "d");
-     
-    while (sid.block != 0) {
-        blk = snap_get_block(sid.block);
-        for (i = sid.index; i >= 0; i--) {
-            if ( max_snaps == 0  ) {
-                freeblock(blk);
-                goto done;
-            }
-            t = ctime(&blk->snaps[i].timestamp.tv_sec);
-            t[strlen(t)-1] = '\0';
-            //printf("%8Ld%4u%14lu.%06lu %12Ld %1s\n",
-            printf("%8Ld%4u%30s %06lu %12Ld %1s\n",
-                    sid.block, i, 
-                    //blk->snaps[i].timestamp.tv_sec,
-                    t,
-                    blk->snaps[i].timestamp.tv_usec,
-                    blk->snaps[i].radix_root,
-                    blk->snaps[i].deleted ? "*" : " ");
-            if ( max_snaps != -1 ) 
-                max_snaps--;
-        }
-        sid = blk->hdr.parent_block;
-        freeblock(blk);
-    }
-done:            
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_tree.c
--- a/tools/blktap/vdi_tree.c   Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,132 +0,0 @@
-/**************************************************************************
- * 
- * vdi_tree.c
- *
- * Output current vdi tree to dot and postscript.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-#define GRAPH_DOT_FILE "vdi.dot"
-#define GRAPH_PS_FILE  "vdi.ps"
-
-typedef struct sh_st {
-    snap_id_t     id;
-    struct sh_st *next;
-} sh_t;
-
-#define SNAP_HASHSZ 1024
-sh_t *node_hash[SNAP_HASHSZ];
-#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
-
-#define SNAPID_EQUAL(_a,_b) \
-    (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
-int sh_check_and_add(snap_id_t *id)
-{
-    sh_t **s = &node_hash[SNAP_HASH(id)];
-    
-    while (*s != NULL) {
-        if (SNAPID_EQUAL(&((*s)->id), id))
-            return 1;
-        *s = (*s)->next;
-    }
-    
-    *s = (sh_t *)malloc(sizeof(sh_t));
-    (*s)->id = *id;
-    (*s)->next = NULL;
-    
-    return 0;
-}
-
-int main(int argc, char *argv[])
-{
-    FILE *f;
-    char dot_file[255] = GRAPH_DOT_FILE;
-    char  ps_file[255] = GRAPH_PS_FILE;
-    int nr_vdis = 0, nr_forks = 0;
-    vdi_registry_t *reg;
-    vdi_t *vdi;
-    int i;
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    reg = get_vdi_registry();
-    
-    if ( reg == NULL ) {
-        printf("couldn't get VDI registry.\n");
-        exit(-1);
-    }
-    
-    if ( argc > 1 ) {
-        strncpy(ps_file, argv[1], 255);
-        ps_file[255] = '\0';
-    }
-    
-    /* now dump it out to a dot file. */
-    printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis);
-    
-    f = fopen(dot_file, "w");
-    
-    /* write graph preamble */
-    fprintf(f, "digraph G {\n");
-    fprintf(f, "   rankdir=LR\n");
-    
-    for (i=0; i<reg->nr_vdis; i++) {
-        char oldnode[255];
-        snap_block_t *blk;
-        snap_id_t id;
-        int nr_snaps, done=0;
-        
-        vdi = vdi_get(i);
-        id = vdi->snap;
-        /* add a node for the id */
-printf("vdi: %d\n", i);
-        fprintf(f, "   n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", 
-                id.block, id.index, vdi->name,
-                id.block, id.index);
-        sprintf(oldnode, "n%Ld%d", id.block, id.index);
-        
-        while (id.block != 0) {
-            blk = snap_get_block(id.block);
-            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
-            id = blk->hdr.fork_block;
-            
-            done = sh_check_and_add(&id);
-            
-            /* add a node for the fork_id */
-            if (!done) {
-                fprintf(f, "   n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", 
-                    id.block, id.index,
-                    id.block, id.index);
-            }
-            
-            /* add an edge between them */
-            fprintf(f, "   n%Ld%d -> %s [label=\"%u snapshots\"]\n",
-                    id.block, id.index, oldnode, nr_snaps);
-            sprintf(oldnode, "n%Ld%d", id.block, id.index);
-            freeblock(blk);
-            
-            if (done) break;
-        }
-    }
-    
-    /* write graph postamble */
-    fprintf(f, "}\n");
-    fclose(f);
-    
-    printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE);
-    {
-        char cmd[255];
-        sprintf(cmd, "dot %s -Tps -o %s", dot_file, ps_file);
-        system(cmd);
-    }
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_unittest.c
--- a/tools/blktap/vdi_unittest.c       Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,184 +0,0 @@
-/**************************************************************************
- * 
- * vdi_unittest.c
- *
- * Run a small test workload to ensure that data access through a vdi
- * is (at least superficially) correct.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "requests-async.h"
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-#define TEST_PAGES  32
-static char *zero_page;
-static char pages[TEST_PAGES][BLOCK_SIZE];
-static int next_page = 0;
-
-void fill_test_pages(void)
-{
-    int i, j;
-    long *page;
-
-    for (i=0; i< TEST_PAGES; i++) {
-        page = (unsigned long *)pages[i];
-        for (j=0; j<(BLOCK_SIZE/4); j++) {
-            page[j] = random();
-        }
-    }
-
-    zero_page = newblock();
-}
-
-inline u64 make_vaddr(u64 L1, u64 L2, u64 L3)
-{
-    u64 ret = L1;
-
-    ret = (ret << 9) | L2;
-    ret = (ret << 9) | L3;
-
-    return ret;
-}
-
-void touch_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3)
-{
-    u64 vaddr;
-    char *page = pages[next_page++];
-    char *rpage = NULL;
-
-    printf("TOUCH (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
-
-    vaddr = make_vaddr(L1, L2, L3);
-    vdi_write_s(vdi, vaddr, page);
-    rpage = vdi_read_s(vdi, vaddr);
-
-    if (rpage == NULL) 
-    {
-        printf( "read %Lu returned NULL\n", vaddr); 
-        return; 
-    }
-
-    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
-    {
-        printf( "read %Lu returned a different page\n", vaddr);
-        return;
-    }
-
-    freeblock(rpage);
-}
-
-void test_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3, char *page)
-{
-    u64 vaddr;
-    char *rpage = NULL;
-
-    printf("TEST  (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
-
-    vaddr = make_vaddr(L1, L2, L3);
-    rpage = vdi_read_s(vdi, vaddr);
-
-    if (rpage == NULL) 
-    {
-        printf( "read %Lu returned NULL\n", vaddr); 
-        return; 
-    }
-
-    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
-    {
-        printf( "read %Lu returned a different page\n", vaddr);
-        return;
-    }
-
-    freeblock(rpage);
-}
-
-void coverage_test(vdi_t *vdi)
-{
-    u64 vaddr;
-    int i, j, k;
-
-    /* Do a series of writes and reads to test all paths through the 
-     * async radix code.  The radix request code will dump CRC warnings
-     * if there are data problems here as well.
-     */
-
-    /* L1 Zero */
-    touch_block(vdi, 0, 0, 0);
-
-    /* L2 Zero */
-    i = next_page;
-    touch_block(vdi, 0, 1, 0);
-
-    /* L3 Zero */
-    j = next_page;
-    touch_block(vdi, 0, 0, 1);
-    k = next_page;
-    touch_block(vdi, 0, 1, 1);
-
-    /* Direct write */
-    touch_block(vdi, 0, 0, 0);
-
-    vdi_snapshot(vdi);
-
-    /* L1 fault */
-    touch_block(vdi, 0, 0, 0);
-    /* test the read-only branches that should have been copied over. */
-    test_block(vdi, 0, 1, 0, pages[i]);
-    test_block(vdi, 0, 0, 1, pages[j]);
-
-    /* L2 fault */
-    touch_block(vdi, 0, 1, 0);
-    test_block(vdi, 0, 1, 1, pages[k]);
-
-    /* L3 fault */
-    touch_block(vdi, 0, 0, 1);
-    
-    /* read - L1 zero */
-    test_block(vdi, 1, 0, 0, zero_page);
-    
-    /* read - L2 zero */
-    test_block(vdi, 0, 2, 0, zero_page);
-
-    /* read - L3 zero */
-    test_block(vdi, 0, 0, 2, zero_page);
-}
-
-int main(int argc, char *argv[])
-{
-    vdi_t       *vdi;
-    u64          id;
-    int          fd;
-    struct stat  st;
-    u64          tot_size;
-    char         spage[BLOCK_SIZE];
-    char        *dpage;
-    u64          vblock = 0, count=0;
-    
-    __init_blockstore();
-    init_block_async();
-    __init_vdi();
-        
-    vdi = vdi_create( NULL, "UNIT TEST VDI");
-    
-    if ( vdi == NULL ) {
-        printf("Failed to create VDI!\n");
-        freeblock(vdi);
-        exit(-1);
-    }
-
-    fill_test_pages();
-    coverage_test(vdi);
-    
-    freeblock(vdi);
-    
-    return (0);
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_validate.c
--- a/tools/blktap/vdi_validate.c       Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,97 +0,0 @@
-/**************************************************************************
- * 
- * vdi_validate.c
- *
- * Intended to sanity-check vm_fill and the underlying vdi code.
- *
- * Block-by-block compare of a vdi with a file/device on the disk.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-#include "requests-async.h"
-
-int main(int argc, char *argv[])
-{
-    vdi_t       *vdi;
-    u64          id;
-    int          fd;
-    struct stat  st;
-    u64          tot_size;
-    char         spage[BLOCK_SIZE], *dpage;
-    char        *vpage;
-    u64          vblock = 0, count=0;
-    
-    __init_blockstore();
-    init_block_async();
-    __init_vdi();
-    
-    if ( argc < 3 ) {
-        printf("usage: %s <VDI id> <filename>\n", argv[0]);
-        exit(-1);
-    }
-        
-    id = (u64) atoll(argv[1]);
-    
-    vdi = vdi_get( id );
-    
-    if ( vdi == NULL ) {
-        printf("Failed to retreive VDI %Ld!\n", id);
-        exit(-1);
-    }
-    
-    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
-    
-    if (fd < 0) {
-        printf("Couldn't open %s!\n", argv[2]);
-        exit(-1);
-    }
-    
-    if ( fstat(fd, &st) != 0 ) {
-        printf("Couldn't stat %s!\n", argv[2]);
-        exit(-1);
-    }
-    
-    tot_size = (u64) st.st_size;
-    printf("Testing VDI %Ld (%Ld bytes).\n", id, tot_size);
-    
-    printf("           ");
-    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
-
-        dpage = vdi_read_s(vdi, vblock);
-
-        if (dpage == NULL) {
-            printf("\n\nfound an unmapped VDI block (%Ld)\n", vblock);
-            exit(0);
-        }
-
-        if (memcmp(spage, dpage, BLOCK_SIZE) != 0) {
-            printf("\n\nblocks don't match! (%Ld)\n", vblock);
-            exit(0);
-        }
-        
-        freeblock(dpage);
-        
-        vblock++;
-        if ((vblock % 1024) == 0) {
-            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
-            fflush(stdout);
-        }
-    }
-    printf("\n");
-    
-    printf("VDI %Ld looks good!\n", id);
-    
-    freeblock(vdi);
-    
-    return (0);
-}

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>