# HG changeset patch
# User akw27@xxxxxxxxxxxxxxxxxxxxxx
# Node ID abbc1d071e22967f0445de795c05c327b2dce888
# Parent 99ff7c3435b2145fb6ef4bb8e340a25a762e0afd
Clean-up of blktap and parallax user space code.
Move parallax stuff to its own sub directory and tidy Makefiles a bit.
Signed-off-by: andrew.warfield@xxxxxxxxxxxx
Signed-off-by: Nguyen Anh Quynh <aquynh@xxxxxxxxx>
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/Makefile
--- a/tools/blktap/Makefile Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/Makefile Sun Jul 3 14:14:09 2005
@@ -2,43 +2,24 @@
MINOR = 0
SONAME = libblktap.so.$(MAJOR)
-CC = gcc
-
XEN_ROOT = ../..
include $(XEN_ROOT)/tools/Rules.mk
-BLKTAP_INSTALL_DIR = /usr/sbin
+SUBDIRS :=
+SUBDIRS += parallax
-INSTALL = install
-INSTALL_PROG = $(INSTALL) -m0755
-INSTALL_DIR = $(INSTALL) -d -m0755
+BLKTAP_INSTALL_DIR = /usr/sbin
-INCLUDES +=
+INSTALL = install
+INSTALL_PROG = $(INSTALL) -m0755
+INSTALL_DIR = $(INSTALL) -d -m0755
+
+INCLUDES += -I. -I $(XEN_LIBXC)
LIBS := -lpthread -lz
SRCS :=
SRCS += blktaplib.c
-
-PLX_SRCS :=
-PLX_SRCS += vdi.c
-PLX_SRCS += radix.c
-PLX_SRCS += snaplog.c
-PLX_SRCS += blockstore.c
-PLX_SRCS += block-async.c
-PLX_SRCS += requests-async.c
-VDI_SRCS := $(PLX_SRCS)
-PLX_SRCS += parallax.c
-
-VDI_TOOLS :=
-VDI_TOOLS += vdi_create
-VDI_TOOLS += vdi_list
-VDI_TOOLS += vdi_snap
-VDI_TOOLS += vdi_snap_list
-VDI_TOOLS += vdi_snap_delete
-VDI_TOOLS += vdi_fill
-VDI_TOOLS += vdi_tree
-VDI_TOOLS += vdi_validate
CFLAGS += -Wall
CFLAGS += -Werror
@@ -46,20 +27,21 @@
#CFLAGS += -O3
CFLAGS += -g3
CFLAGS += -fno-strict-aliasing
-CFLAGS += -I $(XEN_LIBXC)
-CFLAGS += $(INCLUDES) -I.
CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
# Get gcc to generate the dependencies for us.
CFLAGS += -Wp,-MD,.$(@F).d
+CFLAGS += $(INCLUDES)
DEPS = .*.d
OBJS = $(patsubst %.c,%.o,$(SRCS))
-IBINS = blkdump parallax $(VDI_TOOLS)
+IBINS = blkdump
LIB = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR)
-all: mk-symlinks blkdump $(VDI_TOOLS) parallax blockstored
- $(MAKE) $(LIB)
+all: mk-symlinks libblktap.so blkdump
+ @set -e; for subdir in $(SUBDIRS); do \
+ $(MAKE) -C $$subdir $@; \
+ done
LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse)
mk-symlinks:
@@ -77,10 +59,16 @@
$(INSTALL_DIR) -p $(DESTDIR)/usr/include
$(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/$(LIBDIR)
$(INSTALL_PROG) blktaplib.h $(DESTDIR)/usr/include
- $(INSTALL_PROG) $(IBINS) $(DESTDIR)/$(BLKTAP_INSTALL_DIR)
+ $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(BLKTAP_INSTALL_DIR)
+ @set -e; for subdir in $(SUBDIRS); do \
+ $(MAKE) -C $$subdir $@; \
+ done
clean:
- rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump $(VDI_TOOLS) parallax vdi_unittest
+ rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump
+ @set -e; for subdir in $(SUBDIRS); do \
+ $(MAKE) -C $$subdir $@; \
+ done
rpm: all
rm -rf staging
@@ -91,52 +79,17 @@
mv staging/i386/*.rpm .
rm -rf staging
-libblktap.so:
+libblktap.so: $(OBJS)
+ $(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared -o \
+ libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
+ ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR)
ln -sf libblktap.so.$(MAJOR) $@
-libblktap.so.$(MAJOR):
- ln -sf libblktap.so.$(MAJOR).$(MINOR) $@
-libblktap.so.$(MAJOR).$(MINOR): $(OBJS)
- $(CC) -Wl,-soname -Wl,$(SONAME) -shared -o $@ $^ $(LIBS)
-blkdump: $(LIB)
+blkdump: libblktap.so
$(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L. -l blktap blkdump.c
-parallax: $(LIB) $(PLX_SRCS)
- $(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L. -lblktap $(LIBS) $(PLX_SRCS)
+.PHONY: TAGS clean install mk-symlinks rpm
-vdi_list: $(LIB) vdi_list.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c $(LIBS) $(VDI_SRCS)
-
-vdi_create: $(LIB) vdi_create.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_create vdi_create.c $(LIBS) $(VDI_SRCS)
-
-vdi_snap: $(LIB) vdi_snap.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_snap vdi_snap.c $(LIBS) $(VDI_SRCS)
-
-vdi_snap_list: $(LIB) vdi_snap_list.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c $(LIBS) $(VDI_SRCS)
-
-vdi_snap_delete: $(LIB) vdi_snap_delete.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_snap_delete vdi_snap_delete.c $(LIBS) $(VDI_SRCS)
-
-vdi_tree: $(LIB) vdi_tree.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c $(LIBS) $(VDI_SRCS)
-
-vdi_fill: $(LIB) vdi_fill.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_fill vdi_fill.c $(LIBS) $(VDI_SRCS)
-
-vdi_validate: $(LIB) vdi_validate.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c $(LIBS) $(VDI_SRCS)
-
-vdi_unittest: $(LIB) vdi_unittest.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_unittest vdi_unittest.c $(LIBS) $(VDI_SRCS)
-
-blockstored: blockstored.c
- $(CC) $(CFLAGS) -g3 -o blockstored $(LIBS) blockstored.c
-bstest: bstest.c blockstore.c
- $(CC) $(CFLAGS) -g3 -o bstest bstest.c $(LIBS) blockstore.c
-
-.PHONY: TAGS clean install mk-symlinks rpm
TAGS:
etags -t $(SRCS) *.h
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/vdi_tree.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/vdi_tree.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,132 @@
+/**************************************************************************
+ *
+ * vdi_tree.c
+ *
+ * Output current vdi tree to dot and postscript.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+#define GRAPH_DOT_FILE "vdi.dot"
+#define GRAPH_PS_FILE "vdi.ps"
+
+/* One chained entry of node_hash: a snapshot id that has already been seen. */
+typedef struct sh_st {
+ snap_id_t id;
+ struct sh_st *next;
+} sh_t;
+
+/* Number of buckets in node_hash. */
+#define SNAP_HASHSZ 1024
+/* Chained hash table of snapshot ids, filled by sh_check_and_add(). */
+sh_t *node_hash[SNAP_HASHSZ];
+/* Bucket index for a snap_id_t pointer: block XOR index, modulo table size. */
+#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
+
+/* True when two snap_id_t pointers name the same (block, index) pair. */
+#define SNAPID_EQUAL(_a,_b) \
+ (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
+/*
+ * Record a snapshot id in node_hash.
+ *
+ * Returns 1 if the id was already in the table, 0 if it has just been
+ * added.  Exits the program if no memory is available for a new entry.
+ */
+int sh_check_and_add(snap_id_t *id)
+{
+    sh_t **s = &node_hash[SNAP_HASH(id)];
+
+    /*
+     * Walk the chain by moving the link pointer itself.  Assigning through
+     * it (*s = ...) would overwrite the bucket head and drop every entry
+     * visited so far.
+     */
+    while (*s != NULL) {
+        if (SNAPID_EQUAL(&((*s)->id), id))
+            return 1;
+        s = &(*s)->next;
+    }
+
+    /* s now points at the NULL link that terminates the chain. */
+    *s = (sh_t *)malloc(sizeof(sh_t));
+    if (*s == NULL) {
+        perror("malloc");
+        exit(-1);
+    }
+    (*s)->id = *id;
+    (*s)->next = NULL;
+
+    return 0;
+}
+
+/*
+ * vdi_tree: write every VDI in the registry, together with the chain of
+ * fork points behind it, as a Graphviz graph in GRAPH_DOT_FILE, then run
+ * "dot" to render that graph as postscript.
+ *
+ * argv[1], when given, replaces GRAPH_PS_FILE as the postscript file name.
+ * Exits with -1 if the VDI registry or the dot file cannot be opened.
+ */
+int main(int argc, char *argv[])
+{
+    FILE *f;
+    char dot_file[255] = GRAPH_DOT_FILE;
+    char ps_file[255] = GRAPH_PS_FILE;
+    vdi_registry_t *reg;
+    vdi_t *vdi;
+    int i;
+
+    __init_blockstore();
+    __init_vdi();
+
+    reg = get_vdi_registry();
+
+    if ( reg == NULL ) {
+        printf("couldn't get VDI registry.\n");
+        exit(-1);
+    }
+
+    if ( argc > 1 ) {
+        /* keep the terminator inside the buffer: valid indexes are 0..254 */
+        strncpy(ps_file, argv[1], sizeof(ps_file) - 1);
+        ps_file[sizeof(ps_file) - 1] = '\0';
+    }
+
+    /* now dump it out to a dot file. */
+    printf("[o] Dumping state to a dot graph. (%d VDIs)\n",
+           (int)reg->nr_vdis);
+
+    f = fopen(dot_file, "w");
+    if ( f == NULL ) {
+        perror(dot_file);
+        exit(-1);
+    }
+
+    /* write graph preamble */
+    fprintf(f, "digraph G {\n");
+    fprintf(f, " rankdir=LR\n");
+
+    for (i=0; i<reg->nr_vdis; i++) {
+        char oldnode[255];
+        snap_block_t *blk;
+        snap_id_t id;
+        int nr_snaps, done=0;
+
+        vdi = vdi_get(i);
+        if ( vdi == NULL ) {
+            printf("couldn't read VDI %d, skipping.\n", i);
+            continue;
+        }
+        id = vdi->snap;
+        /* add a node for the id */
+        printf("vdi: %d\n", i);
+        fprintf(f, " n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n",
+                id.block, id.index, vdi->name,
+                id.block, id.index);
+        snprintf(oldnode, sizeof(oldnode), "n%Ld%d", id.block, id.index);
+
+        /* follow fork_block links until a null id or an already-drawn node */
+        while (id.block != 0) {
+            blk = snap_get_block(id.block);
+            if ( blk == NULL ) {
+                printf("couldn't read snap block %Ld, skipping.\n", id.block);
+                break;
+            }
+            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
+            id = blk->hdr.fork_block;
+
+            done = sh_check_and_add(&id);
+
+            /* add a node for the fork_id */
+            if (!done) {
+                fprintf(f, " n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n",
+                        id.block, id.index,
+                        id.block, id.index);
+            }
+
+            /* add an edge between them */
+            fprintf(f, " n%Ld%d -> %s [label=\"%u snapshots\"]\n",
+                    id.block, id.index, oldnode, nr_snaps);
+            snprintf(oldnode, sizeof(oldnode), "n%Ld%d", id.block, id.index);
+            freeblock(blk);
+
+            /* the rest of this chain was drawn for an earlier VDI */
+            if (done) break;
+        }
+    }
+
+    /* write graph postamble */
+    fprintf(f, "}\n");
+    fclose(f);
+
+    printf("[o] Generating postscript graph. (%s)\n", ps_file);
+    {
+        /* room for both file names plus the fixed command text */
+        char cmd[sizeof(dot_file) + sizeof(ps_file) + 32];
+
+        /* NOTE(review): ps_file comes from argv[1] and reaches the shell unquoted. */
+        snprintf(cmd, sizeof(cmd), "dot %s -Tps -o %s", dot_file, ps_file);
+        if ( system(cmd) != 0 )
+            printf("couldn't run dot.\n");
+    }
+    return 0;
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/snaplog.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/snaplog.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,238 @@
+/**************************************************************************
+ *
+ * snaplog.c
+ *
+ * Snapshot log on-disk data structure.
+ *
+ */
+
+ /* VDI histories are made from chains of snapshot logs. These logs record
+ * the (radix) root and timestamp of individual snapshots.
+ *
+ * creation of a new VDI involves 'forking' a snapshot log, by creating a
+ * new, empty log (in a new VDI) and parenting it off of a record in an
+ * existing snapshot log.
+ *
+ * snapshot log blocks have at most one writer.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "snaplog.h"
+
+
+
+/*
+ * Read a block from the blockstore and return it only if it carries the
+ * snapshot log magic number.  Returns NULL if the read fails or the block
+ * is not a snapshot log block; the caller releases a non-NULL result with
+ * freeblock().
+ */
+snap_block_t *snap_get_block(u64 block)
+{
+    snap_block_t *candidate = (snap_block_t *)readblock(block);
+
+    if ( (candidate != NULL) && (candidate->hdr.magic != SNAP_MAGIC) ) {
+        /* some other kind of block: give it back and report failure */
+        freeblock(candidate);
+        candidate = NULL;
+    }
+
+    return candidate;
+}
+
+/*
+ * Copy the snapshot record named by id into *target.
+ *
+ * Returns 0 on success, or -1 if id is NULL, its block cannot be read as a
+ * snapshot log block, or its index lies beyond the block's nr_entries.
+ */
+int snap_get_id(snap_id_t *id, snap_rec_t *target)
+{
+    snap_block_t *log;
+    int rc = -1;
+
+    if ( id == NULL )
+        return rc;
+
+    log = snap_get_block(id->block);
+    if ( log == NULL )
+        return rc;
+
+    if ( id->index <= log->hdr.nr_entries ) {
+        *target = log->snaps[id->index];
+        rc = 0;
+    }
+
+    freeblock(log);
+    return rc;
+}
+
+/*
+ * Allocate a new, empty snapshot log block.
+ *
+ * parent_id: previous block in this chain, or NULL for a root log.
+ * fork_id:   where this log was forked; required whenever parent_id is set.
+ * new_id:    receives the id (block, index 0) of the new block.
+ *
+ * When the parent is a different block from the fork point, the new block
+ * inherits the parent's log_entries count.
+ *
+ * Returns 0 on success, -1 on bad arguments or if a block cannot be
+ * created, read or allocated.
+ *
+ * NOTE: the original code carried a disabled check that looked both ids up
+ * with snap_get_id() before continuing; it is still not performed.
+ */
+int __snap_block_create(snap_id_t *parent_id, snap_id_t *fork_id,
+                        snap_id_t *new_id)
+{
+    snap_block_t *blk, *pblk;
+
+    if ( new_id == NULL )
+        return -1;
+
+    /* both ids are dereferenced below whenever a parent is given */
+    if ( (parent_id != NULL) && (fork_id == NULL) )
+        return -1;
+
+    blk = (snap_block_t *)newblock();
+    if ( blk == NULL )
+        return -1;
+
+    blk->hdr.magic = SNAP_MAGIC;
+    blk->hdr.nr_entries = 0;
+    blk->hdr.log_entries = 0;
+    blk->hdr.immutable = 0;
+
+    if ( (parent_id != NULL)
+         && (parent_id->block != fork_id->block)
+         && (parent_id->block != 0)) {
+
+        pblk = snap_get_block(parent_id->block);
+        if ( pblk == NULL ) {
+            /* parent is unreadable: do not publish a half-built block */
+            freeblock(blk);
+            return -1;
+        }
+        blk->hdr.log_entries = pblk->hdr.log_entries;
+        freeblock(pblk);
+    }
+
+    if (parent_id != NULL) {
+        blk->hdr.parent_block = *parent_id;
+        blk->hdr.fork_block = *fork_id;
+    } else {
+        blk->hdr.parent_block = null_snap_id;
+        blk->hdr.fork_block = null_snap_id;
+    }
+
+    new_id->index = 0;
+    new_id->block = allocblock(blk);
+    freeblock(blk);
+    if (new_id->block == 0)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Create a new snapshot log block whose parent and fork point are both
+ * parent_id.  Returns whatever __snap_block_create() returns.
+ */
+int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id)
+{
+    /* a freshly forked log uses one id in both roles */
+    snap_id_t *fork_id = parent_id;
+
+    return __snap_block_create(parent_id, fork_id, new_id);
+}
+
+/*
+ * Append a snapshot record to the log block named by old_id.
+ *
+ * If that block is full, it is marked immutable and a new block is chained
+ * after it to take the record.  new_id receives the block that holds the
+ * record and that block's nr_entries after the append.
+ *
+ * Returns 0 on success, or -1 if rec is marked deleted, the block is
+ * immutable, or a block cannot be read or created.
+ */
+int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id)
+{
+    snap_id_t id = *old_id;
+    snap_block_t *blk;
+
+    if ( rec->deleted == 1 ) {
+        printf("Attempt to append a deleted snapshot!\n");
+        return -1;
+    }
+
+    blk = snap_get_block(id.block);
+    if ( blk == NULL )
+        return -1;
+
+    if ( blk->hdr.immutable != 0 ) {
+        printf("Attempt to snap an immutable snap block!\n");
+        freeblock(blk);
+        return -1;
+    }
+
+    new_id->block = id.block;
+
+    if (blk->hdr.nr_entries == SNAPS_PER_BLOCK) {
+        int ret;
+
+        id.index--; /* make id point to the last full record */
+
+        ret = __snap_block_create(&id, &blk->hdr.fork_block, new_id);
+        if ( ret != 0 ) {
+            freeblock(blk);
+            return -1;
+        }
+
+        /* seal the full block, then continue in the newly created one */
+        blk->hdr.immutable = 1;
+        writeblock(id.block, blk);
+        freeblock(blk);
+        blk = snap_get_block(new_id->block);
+        if ( blk == NULL )
+            return -1;
+        id = *new_id;
+    }
+
+    blk->snaps[blk->hdr.nr_entries] = *rec;
+    blk->hdr.nr_entries++;
+    blk->hdr.log_entries++;
+    new_id->index = blk->hdr.nr_entries;
+    writeblock(id.block, blk);
+    freeblock(blk);
+    return 0;
+}
+
+/*
+ * Delete snapshot p_id by collapsing it into its child c_id.
+ *
+ * height: passed through to collapse().
+ * p_id:   the snapshot to delete.
+ * c_id:   the snapshot that follows it; for now it must be the very next
+ *         record in the same log block.
+ *
+ * Returns the value of collapse() (the number of blocks reclaimed), or -1
+ * if a block cannot be read or the two snapshots are not consecutive.
+ */
+int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id)
+{
+    snap_block_t *p_blk, *c_blk;
+    snap_rec_t *p_rec, *c_rec;
+    int ret = -1;
+
+    p_blk = snap_get_block(p_id->block);
+
+    if (p_blk == NULL) return(-1);
+
+    if (c_id->block == p_id->block)
+    {
+        c_blk = p_blk;
+    } else {
+        c_blk = snap_get_block(c_id->block);
+    }
+
+    /* it is the child block that may have failed to load at this point */
+    if (c_blk == NULL) {
+        freeblock(p_blk);
+        return(-1);
+    }
+
+    /* parent and child must not be deleted. */
+    p_rec = &p_blk->snaps[p_id->index];
+    c_rec = &c_blk->snaps[c_id->index];
+    /*
+    if ( (p_rec->deleted == 1) || (c_rec->deleted == 1) ) {
+        printf("One of those snaps is already deleted.\n");
+        goto done;
+    }
+    */
+    /* first non-deleted thing in the log before child must be parent. */
+
+    /* XXX todo: test the range here for delete (and eventually fork) bits) */
+    /* for now, snaps must be consecutive, on the same log page: */
+
+    if ((p_id->block != c_id->block) || (p_id->index != c_id->index-1))
+    {
+        printf("Deleting non-consecutive snaps is not done yet.\n");
+        goto done;
+    }
+
+    /* mark parent as deleted XXX: may need to lock parent block here.*/
+    p_rec->deleted = 1;
+    writeblock(p_id->block, p_blk);
+
+    /* delete the parent */
+    printf("collapse(%Ld, %Ld)\n", p_rec->radix_root, c_rec->radix_root);
+    ret = collapse(height, p_rec->radix_root, c_rec->radix_root);
+
+    /* return the number of blocks reclaimed. */
+
+done:
+    if (c_blk != p_blk) freeblock(c_blk);
+    freeblock(p_blk);
+
+    return(ret);
+}
+
+/*
+ * Print the snapshot history that ends at snap_id: for each log block,
+ * records are listed from the id's index down to 0, then the walk moves to
+ * the block's parent_block and stops when there is none.
+ */
+void snap_print_history(snap_id_t *snap_id)
+{
+    snap_id_t id = *snap_id;
+    unsigned int idx;
+    snap_block_t *new_blk, *blk = snap_get_block(id.block);
+
+    while ( blk ) {
+        printf("[Snap block %Ld]:\n", id.block);
+
+        /* restart from this block's own index, never past the end of snaps[] */
+        idx = id.index;
+        if ( idx >= SNAPS_PER_BLOCK )
+            idx = SNAPS_PER_BLOCK - 1;
+
+        do {
+            printf(" %03u: root: %Ld ts: %ld.%ld\n", idx,
+                   blk->snaps[idx].radix_root,
+                   blk->snaps[idx].timestamp.tv_sec,
+                   blk->snaps[idx].timestamp.tv_usec);
+        } while (idx-- != 0);
+
+        id = blk->hdr.parent_block;
+        /* a null parent id ends the walk; never reuse a stale block pointer */
+        new_blk = (id.block != 0) ? snap_get_block(id.block) : NULL;
+        freeblock(blk);
+        blk = new_blk;
+    }
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/snaplog.h
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/snaplog.h Sun Jul 3 14:14:09 2005
@@ -0,0 +1,61 @@
+/**************************************************************************
+ *
+ * snaplog.h
+ *
+ * Snapshot log on-disk data structure.
+ *
+ */
+
+#include "radix.h"
+#include "blockstore.h" /* for BLOCK_SIZE */
+
+#ifndef __SNAPLOG_H__
+#define __SNAPLOG_H__
+
+/*
+ * Locates one snapshot record: `block` is handed to snap_get_block() to
+ * fetch a snapshot log block and `index` selects a slot in its snaps[].
+ */
+typedef struct snap_id {
+ u64 block;
+ unsigned int index;
+} snap_id_t;
+
+/* One entry of a snapshot log: the (radix) root and timestamp of a snapshot. */
+typedef struct snap_rec {
+ u64 radix_root;
+ struct timeval timestamp;
+ /* flags: */
+ /* deleted is set on the parent record by snap_collapse(). */
+ unsigned deleted:1;
+} snap_rec_t;
+
+
+int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id);
+int snap_append(snap_id_t *id, snap_rec_t *rec, snap_id_t *new_id);
+int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id);
+void snap_print_history(snap_id_t *snap_id);
+int snap_get_id(snap_id_t *id, snap_rec_t *target);
+
+
+/* exported for vdi debugging */
+#define SNAP_MAGIC 0xff00ff0aa0ff00ffLL
+
+static const snap_id_t null_snap_id = { 0, 0 };
+
+/*
+ * Header at the start of every snapshot log block.  snap_get_block()
+ * rejects any block whose magic is not SNAP_MAGIC.
+ */
+typedef struct snap_block_hdr {
+ u64 magic;
+ snap_id_t parent_block; /* parent block within this chain */
+ snap_id_t fork_block; /* where this log was forked */
+ unsigned log_entries; /* total entries since forking */
+ unsigned short nr_entries; /* entries in snaps[] */
+ unsigned short immutable; /* has this snap page become immutable? */
+} snap_block_hdr_t;
+
+
+/* Number of snap_rec_t records that fit in a block after the header. */
+#define SNAPS_PER_BLOCK \
+ ((BLOCK_SIZE - sizeof(snap_block_hdr_t)) / sizeof(snap_rec_t))
+
+/* A snapshot log block: the header followed by the record array. */
+typedef struct snap_block {
+ snap_block_hdr_t hdr;
+ snap_rec_t snaps[SNAPS_PER_BLOCK];
+} snap_block_t;
+
+
+snap_block_t *snap_get_block(u64 block);
+
+#endif /* __SNAPLOG_H__ */
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/README
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/README Sun Jul 3 14:14:09 2005
@@ -0,0 +1,177 @@
+Parallax Quick Overview
+March 3, 2005
+
+This is intended to provide a quick set of instructions to let you
+guys play with the current parallax source. In its current form, the
+code will let you run an arbitrary number of VMs off of a single disk
+image, doing copy-on-write as they make updates. Each domain is
+assigned a virtual disk image (VDI), which may be based on a snapshot
+of an existing image. All of the VDI and snapshot management should
+currently work.
+
+The current implementation uses a single file as a blockstore for
+_everything_; this will soon be replaced by the fancier backend code
+and the local cache. As it stands, Parallax will create
+"blockstore.dat" in the directory that you run it from, and use
+largefile support to make this grow to unfathomable girth. So, you
+probably want to run the daemon off of a local disk, with a lot of
+free space.
+
+Here's how to get going:
+
+0. Setup:
+---------
+
+Pick a local directory on a disk with lots of room. You should be
+running from a privileged domain (e.g. dom0) with the blocktap
+configured in and block backend NOT.
+
+For convenience (for the moment) copy all of the vdi tools (vdi_*) and
+the parallax daemon from tools/blktap/parallax into this directory.
+
+1. Populate the blockstore:
+---------------------------
+
+First you need to put at least one image into the blockstore. You
+will need a disk image, either as a file or local partition. My
+general approach has been to
+
+(a) make a really big sparse file with
+
+ dd if=/dev/zero of=./image bs=4K count=1 seek=[big value]
+
+(b) put a filesystem into it
+
+ mkfs.ext3 ./image
+
+(c) mount it using loopback
+
+ mkdir ./mnt
+ mount -o loop ./image ./mnt
+
+(d) cd into it and untar one of the image files from srg-roots.
+
+ cd mnt
+ tar ...
+
+NOTE: Beware if your system is FC3. mkfs is not compatible with old
+versions of fedora, and so you don't have much choice but to install
+further fc3 images if you have used the fc3 version of mkfs.
+
+(e) unmount the image
+
+ cd ..
+ umount mnt
+
+(f) now, create a new VDI to hold the image
+
+ ./vdi_create "My new FC3 VDI"
+
+(g) get the id of the new VDI.
+
+ ./vdi_list
+
+ | 0 My new FC3 VDI
+
+(0 is the VDI id... create a few more if you want.)
+
+(h) hoover your image into the new VDI.
+
+ ./vdi_fill 0 ./image
+
+This will pull the entire image into the blockstore and set up a
+mapping tree for it for VDI 0. Passing a device (i.e. /dev/sda3)
+should also work, but vdi_fill has NO notion of sparseness yet, so you
+are going to pump a block into the store for each block you read.
+
+vdi_fill will count up until it is done, and you should be ready to
+go. If you want to be anal, you can use vdi_validate to test the VDI
+against the original image.
+
+2. Create some extra VDIs
+-------------------------
+
+VDIs are actually a list of snapshots, and each snapshot is a full
+image of mappings. So, to preserve an immutable copy of a current
+VDI, do this:
+
+(a) Snapshot your new VDI.
+
+ ./vdi_snap 0
+
+Snapshotting writes the current radix root to the VDI's snapshot log,
+and assigns it a new writable root.
+
+(b) look at the VDI's snapshot log.
+
+ ./vdi_snap_list 0
+
+ | 16 0 Thu Mar 3 19:27:48 2005 565111 31
+
+The first two columns constitute a snapshot id and represent the
+(block, offset) of the snapshot record. The Date tells you when the
+snapshot was made, and 31 is the radix root node of the snapshot.
+
+(c) Create a new VDI, based on that snapshot, and look at the list.
+
+ ./vdi_create "FC3 - Copy 1" 16 0
+ ./vdi_list
+
+ | 0 My new FC3 VDI
+ | 1 FC3 - Copy 1
+
+NOTE: If you have Graphviz installed on your system, you can use
+vdi_tree to generate a postscript of your current set of VDIs and
+snapshots.
+
+
+Create as many VDIs as you need for the VMs that you want to run.
+
+3. Boot some VMs:
+-----------------
+
+Parallax currently uses a hack in xend to pass the VDI id; you need to
+modify the disk line of the VM config that is going to mount it.
+
+(a) set up your vm config, by using the following disk line:
+
+ disk = ['parallax:1,sda1,w,0' ]
+
+This example uses VDI 1 (from vdi_list above), presents it as sda1
+(writable), and uses dom 0 as the backend. If you were running the
+daemon (and tap driver) in some domain other than 0, you would change
+this last parameter.
+
+NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so
+that it knows what to do with "parallax:".
+
+(b) Run parallax in the backend domain.
+
+ ./parallax
+
+(c) create your new domain.
+
+ xm create ...
+
+---
+
+That's pretty much all there is to it at the moment. Hope this is
+clear enough to get you going. Now, a few serious caveats that will
+be sorted out in the almost immediate future:
+
+WARNINGS:
+---------
+
+1. There is NO locking in the VDI tools at the moment, so I'd avoid
+running them in parallel, or more importantly, running them while the
+daemon is running.
+
+2. I doubt that xend will be very happy about restarting if you have
+parallax-using domains. So if it dies while there are active parallax
+doms, you may need to reboot.
+
+3. I've turned off write-in-place. So at the moment, EVERY block
+write is a log append on the blockstore. I've been having some probs
+with the radix tree's marking of writable blocks after snapshots and
+will sort this out very soon.
+
+
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/bstest.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/bstest.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,191 @@
+/**************************************************************************
+ *
+ * bstest.c
+ *
+ * Block store daemon test program.
+ *
+ * usage: bstest <host>|X {r|w|a} ID
+ *
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <errno.h>
+#include "blockstore.h"
+
+/*
+ * Send one request (op, id) of len bytes to the blockstore daemon on host
+ * over UDP, wait for a single reply and print the fields it contains.
+ *
+ * Returns 0.  Exits with status 1 on a bad length, a failed or non-IPv4
+ * host lookup, or any socket error.
+ */
+int direct(char *host, u32 op, u64 id, int len) {
+    struct sockaddr_in sn, peer;
+    int sock;
+    bsmsg_t msgbuf;
+    int rc;
+    socklen_t slen;
+    struct hostent *addr;
+
+    /* never send more than the message buffer actually holds */
+    if (len < 0 || (size_t)len > sizeof(msgbuf)) {
+        fprintf(stderr, "bad message length %d\n", len);
+        exit(1);
+    }
+
+    addr = gethostbyname(host);
+    if (!addr) {
+        /* gethostbyname reports failure through h_errno, not errno */
+        herror("bad hostname");
+        exit(1);
+    }
+    /* the address is copied into a sockaddr_in below, so it must be IPv4 */
+    if (addr->h_addrtype != AF_INET || addr->h_length < 4) {
+        fprintf(stderr, "bad hostname: not an IPv4 address\n");
+        exit(1);
+    }
+    memset(&peer, 0, sizeof(peer));
+    peer.sin_family = addr->h_addrtype;
+    peer.sin_port = htons(BLOCKSTORED_PORT);
+    peer.sin_addr.s_addr = ((struct in_addr *)(addr->h_addr))->s_addr;
+    fprintf(stderr, "Sending to: %u.%u.%u.%u\n",
+            (unsigned int)(unsigned char)addr->h_addr[0],
+            (unsigned int)(unsigned char)addr->h_addr[1],
+            (unsigned int)(unsigned char)addr->h_addr[2],
+            (unsigned int)(unsigned char)addr->h_addr[3]);
+
+    sock = socket(AF_INET, SOCK_DGRAM, 0);
+    if (sock < 0) {
+        perror("Bad socket");
+        exit(1);
+    }
+    memset(&sn, 0, sizeof(sn));
+    sn.sin_family = AF_INET;
+    sn.sin_port = htons(BLOCKSTORED_PORT);
+    sn.sin_addr.s_addr = htonl(INADDR_ANY);
+    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
+        perror("bind");
+        close(sock);
+        exit(1);
+    }
+
+    memset((void *)&msgbuf, 0, sizeof(msgbuf));
+    msgbuf.operation = op;
+    msgbuf.id = id;
+
+    rc = sendto(sock, (void *)&msgbuf, len, 0,
+                (struct sockaddr *)&peer, sizeof(peer));
+    if (rc < 0) {
+        perror("sendto");
+        exit(1);
+    }
+
+    slen = sizeof(peer);
+    len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0,
+                   (struct sockaddr *)&peer, &slen);
+    if (len < 0) {
+        perror("recvfrom");
+        exit(1);
+    }
+
+    /* print only the fields that the reply was long enough to carry */
+    printf("Reply %u bytes:\n", (unsigned int)len);
+    if (len >= MSGBUFSIZE_OP)
+        printf(" operation: %u\n", msgbuf.operation);
+    if (len >= MSGBUFSIZE_FLAGS)
+        printf(" flags: 0x%x\n", msgbuf.flags);
+    if (len >= MSGBUFSIZE_ID)
+        printf(" id: %llu\n", msgbuf.id);
+    if (len >= (MSGBUFSIZE_ID + 4))
+        /* go through unsigned char so a high byte is not sign-extended */
+        printf(" data: %02x %02x %02x %02x...\n",
+               (unsigned int)(unsigned char)msgbuf.block[0],
+               (unsigned int)(unsigned char)msgbuf.block[1],
+               (unsigned int)(unsigned char)msgbuf.block[2],
+               (unsigned int)(unsigned char)msgbuf.block[3]);
+
+    close(sock);
+
+    return 0;
+}
+
+/*
+ * bstest: exercise the block store.
+ *
+ * usage: bstest <host>|X {r|w|a} ID
+ *
+ * With "X" as the host the operation runs against the local blockstore
+ * through readblock/writeblock/allocblock_hint; otherwise the request is
+ * sent to the named host with direct().
+ *
+ * Returns 0 on success and 1 on a usage error or a failed operation.
+ */
+int main (int argc, char **argv) {
+
+    u32 op = 0;
+    u64 id = 0;
+    int len = 0, rc;
+    int status = 0;
+    void *block;
+
+    if (argc < 3) {
+        fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n");
+        return 1;
+    }
+
+    switch (argv[2][0]) {
+    case 'r':
+    case 'R':
+        op = BSOP_READBLOCK;
+        len = MSGBUFSIZE_ID;
+        break;
+    case 'w':
+    case 'W':
+        op = BSOP_WRITEBLOCK;
+        len = MSGBUFSIZE_BLOCK;
+        break;
+    case 'a':
+    case 'A':
+        op = BSOP_ALLOCBLOCK;
+        len = MSGBUFSIZE_BLOCK;
+        break;
+    default:
+        fprintf(stderr, "Unknown action '%s'.\n", argv[2]);
+        return 1;
+    }
+
+    if (argc >= 4)
+        id = atoll(argv[3]);
+
+    if (strcmp(argv[1], "X") == 0) {
+        rc = __init_blockstore();
+        if (rc < 0) {
+            fprintf(stderr, "blockstore init failed.\n");
+            return 1;
+        }
+        switch(op) {
+        case BSOP_READBLOCK:
+            block = readblock(id);
+            if (block) {
+                printf("data: %02x %02x %02x %02x...\n",
+                       (unsigned int)((unsigned char*)block)[0],
+                       (unsigned int)((unsigned char*)block)[1],
+                       (unsigned int)((unsigned char*)block)[2],
+                       (unsigned int)((unsigned char*)block)[3]);
+                /* blocks returned by readblock are released with freeblock */
+                freeblock(block);
+            }
+            else {
+                printf("error\n");
+                status = 1;
+            }
+            break;
+        case BSOP_WRITEBLOCK:
+            block = malloc(BLOCK_SIZE);
+            if (!block) {
+                perror("bstest malloc");
+                return 1;
+            }
+            memset(block, 0, BLOCK_SIZE);
+            rc = writeblock(id, block);
+            if (rc != 0) {
+                printf("error\n");
+                status = 1;
+            }
+            else {
+                printf("OK\n");
+            }
+            free(block);
+            break;
+        case BSOP_ALLOCBLOCK:
+            block = malloc(BLOCK_SIZE);
+            if (!block) {
+                perror("bstest malloc");
+                return 1;
+            }
+            memset(block, 0, BLOCK_SIZE);
+            id = allocblock_hint(block, id);
+            if (id == 0) {
+                printf("error\n");
+                status = 1;
+            }
+            else {
+                printf("ID: %llu\n", id);
+            }
+            free(block);
+            break;
+        }
+    }
+    else {
+        direct(argv[1], op, id, len);
+    }
+
+    return status;
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/vdi_snap_delete.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/vdi_snap_delete.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,48 @@
+/**************************************************************************
+ *
+ * vdi_snap_delete.c
+ *
+ * Delete a snapshot.
+ *
+ * This is not finished: right now it takes a snap n and calls
+ * snap_collapse(n,n+1).
+ *
+ * TODO: support for non-consecutive, non-same-block snaps
+ * Avoid forking probs.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "snaplog.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * vdi_snap_delete: delete the snapshot <snap block> <snap idx> by
+ * collapsing it into the next record in the same block (index + 1).
+ *
+ * Returns 0 after printing the number of blocks freed.  Exits with -1 on a
+ * usage error or when snap_collapse() reports failure.
+ */
+int main(int argc, char *argv[])
+{
+    snap_id_t id, c_id;
+    int ret;
+
+    __init_blockstore();
+    __init_vdi();
+
+    if ( argc != 3 ) {
+        printf("usage: %s <snap block> <snap idx>\n", argv[0]);
+        exit(-1);
+    }
+
+    id.block = (u64) atoll(argv[1]);
+    id.index = (unsigned int) atol (argv[2]);
+
+    /* the child is the record immediately after the one being deleted */
+    c_id = id;
+    c_id.index++;
+
+    ret = snap_collapse(VDI_HEIGHT, &id, &c_id);
+
+    /* a negative result is an error, not a block count */
+    if ( ret < 0 ) {
+        printf("Failed to delete snapshot.\n");
+        exit(-1);
+    }
+
+    printf("Freed %d blocks.\n", ret);
+
+    return 0;
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/block-async.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/block-async.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,393 @@
+/* block-async.c
+ *
+ * Asynchronous block wrappers for parallax.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include "block-async.h"
+#include "blockstore.h"
+#include "vdi.h"
+
+
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* We have a queue of outstanding I/O requests implemented as a
+ * circular producer-consumer ring with free-running buffers.
+ * to allow reordering, this ring indirects to indexes in an
+ * ring of io_structs.
+ *
+ * the block_* calls may either add an entry to this ring and return,
+ * or satisfy the request immediately and call the callback directly.
+ * None of the io calls in parallax should be nested enough to worry
+ * about stack problems with this approach.
+ */
+
+/* Arguments of a queued read: the address given to readblock(). */
+struct read_args {
+ u64 addr;
+};
+
+/* Arguments of a queued write: the address and data given to writeblock(). */
+struct write_args {
+ u64 addr;
+ char *block;
+};
+
+/* Arguments of a queued allocation: the data given to allocblock(). */
+struct alloc_args {
+ char *block;
+};
+
+/*
+ * One queued request.  `op` selects which member of `u` is meaningful;
+ * IO_RWAKE and IO_WWAKE carry no arguments.  do_next_io_req() performs the
+ * operation and then calls cb with the result and `param`.
+ */
+struct pending_io_req {
+ enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op;
+ union {
+ struct read_args r;
+ struct write_args w;
+ struct alloc_args a;
+ } u;
+ io_cb_t cb;
+ void *param;
+};
+
+/*
+ * Put a radix_lock into its initial state: the mutex is initialised and
+ * each of the 1024 rows has a zero count, no waiters and state ANY.
+ */
+void radix_lock_init(struct radix_lock *r)
+{
+    int row = 0;
+
+    pthread_mutex_init(&r->lock, NULL);
+
+    while (row < 1024) {
+        r->state[row] = ANY;
+        r->waiters[row] = NULL;
+        r->lines[row] = 0;
+        row++;
+    }
+}
+
+/* maximum outstanding I/O requests issued asynchronously */
+/* must be a power of 2.*/
+#define MAX_PENDING_IO 1024
+
+/* how many threads to concurrently issue I/O to the disk. */
+#define IO_POOL_SIZE 10
+
+/* Pool of request structs; ring entries refer to them by index. */
+static struct pending_io_req pending_io_reqs[MAX_PENDING_IO];
+/* The ring itself: each slot holds an index into pending_io_reqs. */
+static int pending_io_list[MAX_PENDING_IO];
+/*
+ * Free-running ring counters: io_prod advances when a request is queued,
+ * io_cons when an I/O thread takes one, and io_free when do_next_io_req()
+ * hands a request struct back to the ring.
+ */
+static unsigned long io_prod = 0, io_cons = 0, io_free = 0;
+/* Reduce a free-running counter to a ring slot. */
+#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1))
+/* Pool index of a request pointer. */
+#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs)
+/* Request struct that the ring slot for counter value _x refers to. */
+#define PENDING_IO_ENT(_x) \
+ (&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]])
+/* False once io_prod is MAX_PENDING_IO ahead of io_free. */
+#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod)
+/* True while something has been queued that no thread has taken yet. */
+#define CAN_CONSUME_PENDING_IO (io_cons != io_prod)
+/* Held around every update of the ring and its counters. */
+static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER;
+/* Signalled when requests are queued; I/O threads wait on it. */
+static pthread_cond_t pending_io_cond = PTHREAD_COND_INITIALIZER;
+
+/* Fill the ring so that slot i initially refers to request struct i. */
+static void init_pending_io(void)
+{
+    int slot = MAX_PENDING_IO;
+
+    while (slot-- > 0)
+        pending_io_list[slot] = slot;
+}
+
+/*
+ * Queue an asynchronous read of the block at addr.  An I/O thread performs
+ * the read and then calls cb with the result and param.
+ */
+void block_read(u64 addr, io_cb_t cb, void *param)
+{
+    struct pending_io_req *slot;
+
+    pthread_mutex_lock(&pending_io_lock);
+    assert(CAN_PRODUCE_PENDING_IO);
+
+    /* claim the next ring entry, then advance the producer counter */
+    slot = PENDING_IO_ENT(io_prod);
+    io_prod++;
+    DPRINTF("Produce (R) %lu (%p)\n", io_prod - 1, slot);
+
+    slot->param = param;
+    slot->cb = cb;
+    slot->u.r.addr = addr;
+    slot->op = IO_READ;
+
+    pthread_cond_signal(&pending_io_cond);
+    pthread_mutex_unlock(&pending_io_lock);
+}
+
+
+/*
+ * Queue an asynchronous write of block to addr.  An I/O thread performs
+ * the write and then calls cb with the result and param.
+ */
+void block_write(u64 addr, char *block, io_cb_t cb, void *param)
+{
+    struct pending_io_req *slot;
+
+    pthread_mutex_lock(&pending_io_lock);
+    assert(CAN_PRODUCE_PENDING_IO);
+
+    /* claim the next ring entry, then advance the producer counter */
+    slot = PENDING_IO_ENT(io_prod);
+    io_prod++;
+    DPRINTF("Produce (W) %lu (%p)\n", io_prod - 1, slot);
+
+    slot->param = param;
+    slot->cb = cb;
+    slot->u.w.block = block;
+    slot->u.w.addr = addr;
+    slot->op = IO_WRITE;
+
+    pthread_cond_signal(&pending_io_cond);
+    pthread_mutex_unlock(&pending_io_lock);
+}
+
+
+/*
+ * Queue an asynchronous allocation that stores block in a new location.
+ * An I/O thread performs it and then calls cb with the result and param.
+ */
+void block_alloc(char *block, io_cb_t cb, void *param)
+{
+    struct pending_io_req *slot;
+
+    pthread_mutex_lock(&pending_io_lock);
+    assert(CAN_PRODUCE_PENDING_IO);
+
+    /* claim the next ring entry, then advance the producer counter */
+    slot = PENDING_IO_ENT(io_prod);
+    io_prod++;
+
+    slot->param = param;
+    slot->cb = cb;
+    slot->u.a.block = block;
+    slot->op = IO_ALLOC;
+
+    pthread_cond_signal(&pending_io_cond);
+    pthread_mutex_unlock(&pending_io_lock);
+}
+
+/*
+ * Take a read lock on `row`.
+ *
+ * If the row is not write-locked (lines >= 0) and its state is not STOP,
+ * the reader count is incremented and cb is called straight away with an
+ * IO_INT_T result of 0.  Otherwise the request is appended to the row's
+ * waiter list and cb is called later, after wake_waiters() queues an
+ * IO_RWAKE request for it.
+ */
+void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
+{
+ struct io_ret ret;
+ pthread_mutex_lock(&r->lock);
+
+ if (( r->lines[row] >= 0 ) && (r->state[row] != STOP)) {
+ r->lines[row]++;
+ r->state[row] = READ;
+ DPRINTF("RLOCK : %3d (row: %d)\n", r->lines[row], row);
+ /* the lock is dropped before the callback runs */
+ pthread_mutex_unlock(&r->lock);
+ ret.type = IO_INT_T;
+ ret.u.i = 0;
+ cb(ret, param);
+ } else {
+ struct radix_wait **rwc;
+ /* NOTE(review): the malloc result is used without a NULL check. */
+ struct radix_wait *rw =
+ (struct radix_wait *) malloc (sizeof(struct radix_wait));
+ DPRINTF("RLOCK : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
+ rw->type = RLOCK;
+ rw->param = param;
+ rw->cb = cb;
+ rw->next = NULL;
+ /* append to waiters list. */
+ rwc = &r->waiters[row];
+ while (*rwc != NULL) rwc = &(*rwc)->next;
+ *rwc = rw;
+ pthread_mutex_unlock(&r->lock);
+ return;
+ }
+}
+
+
+/*
+ * Take the write lock on `row`.
+ *
+ * If the row is idle (state ANY, count 0) it is marked STOP with a count
+ * of -1 and cb is called straight away with an IO_INT_T result of 0.
+ * Otherwise the request is appended to the row's waiter list and cb is
+ * called later, after wake_waiters() queues an IO_WWAKE request for it.
+ */
+void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
+{
+ struct io_ret ret;
+ pthread_mutex_lock(&r->lock);
+
+ /* the second check here is redundant -- just here for debugging now. */
+ if ((r->state[row] == ANY) && ( r->lines[row] == 0 )) {
+ r->state[row] = STOP;
+ r->lines[row] = -1;
+ DPRINTF("WLOCK : %3d (row: %d)\n", r->lines[row], row);
+ /* the lock is dropped before the callback runs */
+ pthread_mutex_unlock(&r->lock);
+ ret.type = IO_INT_T;
+ ret.u.i = 0;
+ cb(ret, param);
+ } else {
+ struct radix_wait **rwc;
+ /* NOTE(review): the malloc result is used without a NULL check. */
+ struct radix_wait *rw =
+ (struct radix_wait *) malloc (sizeof(struct radix_wait));
+ DPRINTF("WLOCK : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
+ rw->type = WLOCK;
+ rw->param = param;
+ rw->cb = cb;
+ rw->next = NULL;
+ /* append to waiters list. */
+ rwc = &r->waiters[row];
+ while (*rwc != NULL) rwc = &(*rwc)->next;
+ *rwc = rw;
+ pthread_mutex_unlock(&r->lock);
+ return;
+ }
+
+}
+
+/* called with radix_lock locked and lock count of zero. */
+/*
+ * Hand the row to whoever is first on its waiter list.  A writer at the
+ * head gets the row to itself; otherwise every reader up to the first
+ * queued writer is admitted.  Each admitted waiter is turned into an
+ * IO_WWAKE / IO_RWAKE request on the pending I/O ring, so its callback is
+ * run by an I/O thread rather than by the caller.
+ */
+static void wake_waiters(struct radix_lock *r, int row)
+{
+ struct pending_io_req *req;
+ struct radix_wait *rw;
+
+ if (r->lines[row] != 0) return;
+ if (r->waiters[row] == NULL) return;
+
+ if (r->waiters[row]->type == WLOCK) {
+
+ rw = r->waiters[row];
+ pthread_mutex_lock(&pending_io_lock);
+ assert(CAN_PRODUCE_PENDING_IO);
+
+ req = PENDING_IO_ENT(io_prod++);
+ req->op = IO_WWAKE;
+ req->cb = rw->cb;
+ req->param = rw->param;
+ r->lines[row] = -1; /* write lock the row. */
+ r->state[row] = STOP;
+ r->waiters[row] = rw->next;
+ free(rw);
+ pthread_mutex_unlock(&pending_io_lock);
+
+ } else /* RLOCK */ {
+
+ while ((r->waiters[row] != NULL) && (r->waiters[row]->type == RLOCK)) {
+ rw = r->waiters[row];
+ pthread_mutex_lock(&pending_io_lock);
+ assert(CAN_PRODUCE_PENDING_IO);
+
+ req = PENDING_IO_ENT(io_prod++);
+ req->op = IO_RWAKE;
+ req->cb = rw->cb;
+ req->param = rw->param;
+ r->lines[row]++; /* read lock the row. */
+ r->state[row] = READ;
+ r->waiters[row] = rw->next;
+ free(rw);
+ pthread_mutex_unlock(&pending_io_lock);
+ }
+
+ if (r->waiters[row] != NULL) /* There is a write queued still */
+ r->state[row] = STOP;
+ }
+
+ /* NOTE(review): one signal is sent even when several wake requests were queued. */
+ pthread_mutex_lock(&pending_io_lock);
+ pthread_cond_signal(&pending_io_cond);
+ pthread_mutex_unlock(&pending_io_lock);
+}
+
+/*
+ * Release one read lock on `row`.  When the last reader leaves, the row
+ * returns to state ANY and queued waiters are woken.  cb is then called
+ * with an IO_INT_T result of 0 and param.
+ */
+void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
+{
+    struct io_ret ret;
+
+    pthread_mutex_lock(&r->lock);
+    assert(r->lines[row] > 0); /* try to catch misuse. */
+    r->lines[row]--;
+    if (r->lines[row] == 0) {
+        r->state[row] = ANY;
+        wake_waiters(r, row);
+    }
+    pthread_mutex_unlock(&r->lock);
+
+    /* give the callback a defined result, as the lock calls do */
+    ret.type = IO_INT_T;
+    ret.u.i = 0;
+    cb(ret, param);
+}
+
+/*
+ * Release the write lock on `row`: the row returns to state ANY with a
+ * zero count and queued waiters are woken.  cb is then called with an
+ * IO_INT_T result of 0 and param.
+ */
+void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
+{
+    struct io_ret ret;
+
+    pthread_mutex_lock(&r->lock);
+    assert(r->lines[row] == -1); /* try to catch misuse. */
+    r->lines[row] = 0;
+    r->state[row] = ANY;
+    wake_waiters(r, row);
+    pthread_mutex_unlock(&r->lock);
+
+    /* give the callback a defined result, as the lock calls do */
+    ret.type = IO_INT_T;
+    ret.u.i = 0;
+    cb(ret, param);
+}
+
+/* consumer calls */
+/*
+ * Carry out one request taken from the pending ring, return its request
+ * struct to the ring, and call the request's callback with the result.
+ *
+ * A request with an unknown op is released without calling a callback.
+ */
+static void do_next_io_req(struct pending_io_req *req)
+{
+    struct io_ret ret;
+    io_cb_t cb;
+    void *param;
+    int known = 1;
+
+    memset(&ret, 0, sizeof(ret));
+
+    switch (req->op) {
+    case IO_READ:
+        ret.type = IO_BLOCK_T;
+        ret.u.b = readblock(req->u.r.addr);
+        break;
+    case IO_WRITE:
+        ret.type = IO_INT_T;
+        ret.u.i = writeblock(req->u.w.addr, req->u.w.block);
+        DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr);
+        break;
+    case IO_ALLOC:
+        ret.type = IO_ADDR_T;
+        ret.u.a = allocblock(req->u.a.block);
+        break;
+    case IO_RWAKE:
+        DPRINTF("WAKE DEFERRED RLOCK!\n");
+        ret.type = IO_INT_T;
+        ret.u.i = 0;
+        break;
+    case IO_WWAKE:
+        DPRINTF("WAKE DEFERRED WLOCK!\n");
+        ret.type = IO_INT_T;
+        ret.u.i = 0;
+        break;
+    default:
+        DPRINTF("Unknown IO operation on pending list!\n");
+        known = 0;
+        break;
+    }
+
+    /*
+     * Copy everything still needed out of req before it goes back on the
+     * ring: once released, a producer may reuse and overwrite it.
+     */
+    cb = req->cb;
+    param = req->param;
+
+    pthread_mutex_lock(&pending_io_lock);
+    pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req);
+    pthread_mutex_unlock(&pending_io_lock);
+
+    if (!known)
+        return;
+
+    assert(cb != NULL);
+    cb(ret, param);
+}
+
+/*
+ * Body of each I/O pool thread: wait until a request is pending, take it
+ * off the ring and run it, forever.
+ *
+ * param is the heap-allocated thread id passed to pthread_create(); it is
+ * freed here and its value is not otherwise used.  Never returns.
+ */
+void *io_thread(void *param)
+{
+    struct pending_io_req *req;
+
+    free(param);
+
+    for (;;) {
+        pthread_mutex_lock(&pending_io_lock);
+        /* loop on the predicate: the wait can return with nothing pending */
+        while (io_prod == io_cons) {
+            pthread_cond_wait(&pending_io_cond, &pending_io_lock);
+        }
+
+        req = PENDING_IO_ENT(io_cons++);
+        pthread_mutex_unlock(&pending_io_lock);
+
+        /* run the request without holding the ring lock */
+        do_next_io_req(req);
+    }
+}
+
+static pthread_t io_pool[IO_POOL_SIZE];
+/*
+ * Start the IO_POOL_SIZE worker threads.  Each thread receives its own
+ * heap-allocated id and frees it.  A thread that cannot be started is
+ * reported and skipped.
+ */
+void start_io_threads(void)
+{
+    int i;
+
+    for (i=0; i < IO_POOL_SIZE; i++) {
+        int ret, *t;
+
+        t = (int *)malloc(sizeof(int));
+        if (t == NULL) {
+            printf("Error starting thread %d\n", i);
+            continue;
+        }
+        *t = i;
+
+        ret = pthread_create(&io_pool[i], NULL, io_thread, t);
+        if (ret != 0) {
+            printf("Error starting thread %d\n", i);
+            /* no thread exists to take ownership of the id */
+            free(t);
+        }
+    }
+}
+
+/* Set up the asynchronous block layer: the request ring, then the I/O threads. */
+void init_block_async(void)
+{
+    /* the ring must be filled before any thread can take from it */
+    init_pending_io();
+
+    start_io_threads();
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/vdi_snap_list.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/vdi_snap_list.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,82 @@
+/**************************************************************************
+ *
+ * vdi_snap_list.c
+ *
+ * Print a list of snapshots for the specified vdi.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * Usage: vdi_snap_list <VDI id> [max snaps]
+ *
+ * Prints one line per snapshot record of the given VDI, starting from the
+ * record just before vdi->snap and following parent_block links until a
+ * block id of 0 is reached.  When [max snaps] is given, at most that many
+ * records are printed.
+ *
+ * Exits with -1 on a usage error, an unknown VDI, or an unreadable
+ * snapshot block; returns 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi;
+    u64 id;
+    int i, max_snaps = -1;       /* -1 means "no limit" */
+    snap_block_t *blk;
+    snap_id_t sid;
+    char *t;
+
+    __init_blockstore();
+    __init_vdi();
+
+    if ( argc == 1 ) {
+        printf("usage: %s <VDI id> [max snaps]\n", argv[0]);
+        exit(-1);
+    }
+
+    id = (u64) atoll(argv[1]);
+
+    if ( argc > 2 ) {
+        max_snaps = atoi(argv[2]);
+    }
+
+    vdi = vdi_get(id);
+
+    if ( vdi == NULL ) {
+        printf("couldn't find the requested VDI.\n");
+        exit(-1);
+    }
+
+    /* NOTE(review): presumably vdi->snap names the next free slot, hence
+     * the step back to the newest record -- confirm against vdi.c. */
+    sid = vdi->snap;
+    sid.index--;
+
+    printf("%8s%4s%37s %12s %1s\n", "Block", "idx", "timestamp",
+           "radix root", "d");
+
+    while (sid.block != 0) {
+        blk = snap_get_block(sid.block);
+        if ( blk == NULL ) {
+            /* Previously dereferenced without a check. */
+            printf("couldn't read snapshot block %Ld.\n", sid.block);
+            freeblock(vdi);
+            exit(-1);
+        }
+        for (i = sid.index; i >= 0; i--) {
+            if ( max_snaps == 0 ) {
+                freeblock(blk);
+                goto done;
+            }
+            t = ctime(&blk->snaps[i].timestamp.tv_sec);
+            if ( t == NULL )
+                t = "(invalid time)";    /* ctime() may fail */
+            else
+                t[strlen(t)-1] = '\0';   /* drop ctime()'s trailing '\n' */
+            printf("%8Ld%4u%30s %06lu %12Ld %1s\n",
+                   sid.block, i,
+                   t,
+                   blk->snaps[i].timestamp.tv_usec,
+                   blk->snaps[i].radix_root,
+                   blk->snaps[i].deleted ? "*" : " ");
+            if ( max_snaps != -1 )
+                max_snaps--;
+        }
+        /* Copy the parent link out before the block is released. */
+        sid = blk->hdr.parent_block;
+        freeblock(blk);
+    }
+done:
+    freeblock(vdi);
+    return 0;
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/vdi_list.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/vdi_list.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,47 @@
+/**************************************************************************
+ *
+ * vdi_list.c
+ *
+ * Print a list of VDIs on the block store.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * vdi_list: walk ids 0 .. nr_vdis-1 of the VDI registry and print the id
+ * and name of every id for which vdi_get() returns a VDI.
+ *
+ * Exits with -1 if the registry cannot be obtained; returns 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_registry_t *registry;
+    vdi_t *entry;
+    int slot;
+
+    __init_blockstore();
+    __init_vdi();
+
+    registry = get_vdi_registry();
+    if ( registry == NULL ) {
+        printf("couldn't get VDI registry.\n");
+        exit(-1);
+    }
+
+    for (slot = 0; slot < registry->nr_vdis; slot++) {
+        entry = vdi_get(slot);
+        if ( entry == NULL )
+            continue;           /* nothing stored under this id */
+
+        printf("%10Ld %60s\n", entry->id, entry->name);
+        freeblock(entry);
+    }
+
+    freeblock(registry);
+
+    return 0;
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/blockstore.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/blockstore.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,1350 @@
+/**************************************************************************
+ *
+ * blockstore.c
+ *
+ * Simple block store interface
+ *
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <stdarg.h>
+#include "blockstore.h"
+#include <pthread.h>
+
+//#define BLOCKSTORE_REMOTE
+//#define BSDEBUG
+
+#define RETRY_TIMEOUT 1000000 /* microseconds */
+
+/*****************************************************************************
+ * Debugging
+ */
+#ifdef BSDEBUG
+/*
+ * DB: printf-style debug trace to stderr, prefixed with the value stored
+ * under tid_key for the calling thread.
+ * NOTE(review): tid_key is not declared in the visible part of this file;
+ * presumably it comes from a header -- confirm.
+ */
+void DB(char *format, ...)
+{
+ va_list args;
+ fprintf(stderr, "[%05u] ", (int)pthread_getspecific(tid_key));
+ va_start(args, format);
+ vfprintf(stderr, format, args);
+ va_end(args);
+}
+#else
+/* Debugging disabled: every DB() call expands to a no-op expression. */
+#define DB(format, ...) (void)0
+#endif
+
+#ifdef BLOCKSTORE_REMOTE
+
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
+#include <netdb.h>
+
+/*****************************************************************************
+ * Network state *
+ *****************************************************************************/
+
+/* The individual disk servers we talk to. These will be referenced by
+ * an integer index into bsservers[].
+ */
+bsserver_t bsservers[MAX_SERVERS];
+
+/* The cluster map. This is indexed by an integer cluster number.
+ */
+bscluster_t bsclusters[MAX_CLUSTERS];
+
+/* Local socket.
+ */
+struct sockaddr_in sin_local;
+int bssock = 0;
+
+/*****************************************************************************
+ * Notification *
+ *****************************************************************************/
+
+/* Per-thread wakeup state used by RECV_NOTIFY / RECV_AWAIT. */
+typedef struct pool_thread_t_struct {
+ pthread_mutex_t ptmutex; /* held while newdata is read or written */
+ pthread_cond_t ptcv; /* signalled by RECV_NOTIFY */
+ int newdata; /* set to 1 by RECV_NOTIFY, cleared by RECV_AWAIT */
+} pool_thread_t;
+
+/* Indexed by thread id; READ_POOL_SIZE+1 slots. */
+pool_thread_t pool_thread[READ_POOL_SIZE+1];
+
+/*
+ * RECV_NOTIFY(tid): mark new data for thread @tid and signal its condition
+ * variable, all under that thread's mutex.
+ *
+ * Both macros are wrapped in do { } while (0) so that each expands to a
+ * single statement and stays correct inside an unbraced if/else.
+ */
+#define RECV_NOTIFY(tid) do {                                   \
+    pthread_mutex_lock(&(pool_thread[tid].ptmutex));            \
+    pool_thread[tid].newdata = 1;                               \
+    DB("CV Waking %u", tid);                                    \
+    pthread_cond_signal(&(pool_thread[tid].ptcv));              \
+    pthread_mutex_unlock(&(pool_thread[tid].ptmutex));          \
+} while (0)
+
+/*
+ * RECV_AWAIT(tid): if data has already been flagged for thread @tid,
+ * consume the flag and return at once; otherwise block once on the
+ * condition variable.  The flag is not cleared after a wait, so callers
+ * must re-check their own completion condition in a loop.
+ */
+#define RECV_AWAIT(tid) do {                                    \
+    pthread_mutex_lock(&(pool_thread[tid].ptmutex));            \
+    if (pool_thread[tid].newdata) {                             \
+        pool_thread[tid].newdata = 0;                           \
+        DB("CV Woken %u", tid);                                 \
+    }                                                           \
+    else {                                                      \
+        DB("CV Waiting %u", tid);                               \
+        pthread_cond_wait(&(pool_thread[tid].ptcv),             \
+                          &(pool_thread[tid].ptmutex));         \
+    }                                                           \
+    pthread_mutex_unlock(&(pool_thread[tid].ptmutex));          \
+} while (0)
+
+/*****************************************************************************
+ * Message queue management *
+ *****************************************************************************/
+
+/* Protects the queue manipulation critical regions.
+ */
+pthread_mutex_t ptmutex_queue;
+#define ENTER_QUEUE_CR pthread_mutex_lock(&ptmutex_queue)
+#define LEAVE_QUEUE_CR pthread_mutex_unlock(&ptmutex_queue)
+
+pthread_mutex_t ptmutex_recv;
+#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv)
+#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv)
+
+/* A message queue entry. We allocate one of these for every request we send.
+ * Asynchronous reply reception also uses one of these (see rx_qe).
+ */
+typedef struct bsq_t_struct {
+ struct bsq_t_struct *prev; /* queue links, guarded by ptmutex_queue */
+ struct bsq_t_struct *next;
+ int status; /* BSQ_STATUS_* bits; MATCHED is set by queuesearch() */
+ int server; /* index into bsservers[] */
+ int length; /* message length in bytes */
+ struct msghdr msghdr; /* passed to sendmsg()/recvmsg() */
+ struct iovec iov[2]; /* [0] = header, [1] = block payload, if any */
+ int tid; /* id of the requesting thread, used by RECV_NOTIFY */
+ struct timeval tv_sent; /* time of the most recent (re)send */
+ bshdr_t message; /* wire header */
+ void *block; /* block payload buffer, or NULL */
+} bsq_t;
+
+#define BSQ_STATUS_MATCHED 1
+
+pthread_mutex_t ptmutex_luid;
+#define ENTER_LUID_CR pthread_mutex_lock(&ptmutex_luid)
+#define LEAVE_LUID_CR pthread_mutex_unlock(&ptmutex_luid)
+
+/* Next id to hand out; only touched inside the LUID critical region. */
+static u64 luid_cnt = 0x1000ULL;
+/* new_luid: return the current counter value and advance it by one. */
+u64 new_luid(void) {
+ u64 luid;
+ ENTER_LUID_CR;
+ luid = luid_cnt++;
+ LEAVE_LUID_CR;
+ return luid;
+}
+
+/* Queue of outstanding requests.
+ */
+bsq_t *bs_head = NULL;
+bsq_t *bs_tail = NULL;
+int bs_qlen = 0;
+
+/* queuedebug: dump the outstanding-request queue to stderr, tagged with
+ * @msg.  Takes the queue lock itself, so it must not be called with the
+ * lock already held.
+ */
+void queuedebug(char *msg) {
+ bsq_t *q;
+ ENTER_QUEUE_CR;
+ fprintf(stderr, "Q: %s len=%u\n", msg, bs_qlen);
+ for (q = bs_head; q; q = q->next) {
+ fprintf(stderr, " luid=%016llx server=%u\n",
+ q->message.luid, q->server);
+ }
+ LEAVE_QUEUE_CR;
+}
+
+/* enqueue: append @qe to the tail of the outstanding-request queue.
+ *
+ * @return: always 0
+ */
+int enqueue(bsq_t *qe) {
+ ENTER_QUEUE_CR;
+ qe->next = NULL;
+ qe->prev = bs_tail;
+ if (!bs_head)
+ bs_head = qe;
+ else
+ bs_tail->next = qe;
+ bs_tail = qe;
+ bs_qlen++;
+ LEAVE_QUEUE_CR;
+#ifdef BSDEBUG
+ /* called after the lock is dropped: queuedebug() takes it again */
+ queuedebug("enqueue");
+#endif
+ return 0;
+}
+
+/*
+ * dequeue: unlink @qe from the outstanding-request queue if it is on it.
+ * The entry is located by pointer identity; it is not freed.
+ *
+ * @return: 1 if the entry was found and removed, 0 if it was not queued
+ */
+int dequeue(bsq_t *qe) {
+    bsq_t *q;
+    int found = 0;
+
+    ENTER_QUEUE_CR;
+    for (q = bs_head; q; q = q->next) {
+        if (q != qe)
+            continue;
+
+        if (q->prev)
+            q->prev->next = q->next;
+        else
+            bs_head = q->next;
+        if (q->next)
+            q->next->prev = q->prev;
+        else
+            bs_tail = q->prev;
+        bs_qlen--;
+        found = 1;
+        break;
+    }
+    LEAVE_QUEUE_CR;
+
+#ifdef BSDEBUG
+    /* The success path used to log "dequeue not found" as well. */
+    queuedebug(found ? "dequeue found" : "dequeue not found");
+#endif
+    return found;
+}
+
+/* queuesearch: find the queued request that the received entry @qe answers.
+ *
+ * A request matches when server, operation and luid are all equal.  On a
+ * match the reply's length, flags and id are copied into the request, the
+ * request is marked BSQ_STATUS_MATCHED and unlinked from the queue.
+ *
+ * @return: the matched (now unqueued) request, or NULL if none matched
+ */
+bsq_t *queuesearch(bsq_t *qe) {
+ bsq_t *q;
+ ENTER_QUEUE_CR;
+ for (q = bs_head; q; q = q->next) {
+ if ((qe->server == q->server) &&
+ (qe->message.operation == q->message.operation) &&
+ (qe->message.luid == q->message.luid)) {
+
+ /* For reads, hand the received buffer over to the request.
+ * NOTE(review): the error test reads the request's flags before
+ * they are overwritten with the reply's flags below -- confirm
+ * that this is intended. */
+ if ((q->message.operation == BSOP_READBLOCK) &&
+ ((q->message.flags & BSOP_FLAG_ERROR) == 0)) {
+ q->block = qe->block;
+ qe->block = NULL;
+ }
+ q->length = qe->length;
+ q->message.flags = qe->message.flags;
+ q->message.id = qe->message.id;
+ q->status |= BSQ_STATUS_MATCHED;
+
+ /* unlink q from the doubly-linked queue */
+ if (q->prev)
+ q->prev->next = q->next;
+ else
+ bs_head = q->next;
+ if (q->next)
+ q->next->prev = q->prev;
+ else
+ bs_tail = q->prev;
+ q->next = NULL;
+ q->prev = NULL;
+ bs_qlen--;
+ goto found;
+ }
+ }
+
+ LEAVE_QUEUE_CR;
+#ifdef BSDEBUG
+ queuedebug("queuesearch not found");
+#endif
+ return NULL;
+
+ found:
+ LEAVE_QUEUE_CR;
+#ifdef BSDEBUG
+ queuedebug("queuesearch found");
+#endif
+ return q;
+}
+
+/*****************************************************************************
+ * Network communication *
+ *****************************************************************************/
+
+/*
+ * send_message: register @qe as an outstanding request and transmit it.
+ *
+ * Fills in the msghdr/iovec (header only, or header plus block when
+ * qe->block is set), assigns a fresh luid, records the calling thread's id
+ * and the send time, queues the entry and sends it without blocking.
+ *
+ * If the send fails the entry is taken off the queue again, because the
+ * callers free @qe on failure and a freed entry must not stay linked
+ * where queue_runner() would walk over it.
+ *
+ * @return: number of bytes sent, or a negative value on failure
+ */
+int send_message(bsq_t *qe) {
+    int rc;
+
+    qe->msghdr.msg_name = (void *)&(bsservers[qe->server].sin);
+    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
+    qe->msghdr.msg_iov = qe->iov;
+    if (qe->block)
+        qe->msghdr.msg_iovlen = 2;
+    else
+        qe->msghdr.msg_iovlen = 1;
+    qe->msghdr.msg_control = NULL;
+    qe->msghdr.msg_controllen = 0;
+    qe->msghdr.msg_flags = 0;
+
+    qe->iov[0].iov_base = (void *)&(qe->message);
+    qe->iov[0].iov_len = MSGBUFSIZE_ID;
+
+    if (qe->block) {
+        qe->iov[1].iov_base = qe->block;
+        qe->iov[1].iov_len = BLOCK_SIZE;
+    }
+
+    qe->message.luid = new_luid();
+
+    qe->status = 0;
+    qe->tid = (int)pthread_getspecific(tid_key);
+    /* Queue before sending so a fast reply always finds its request. */
+    if (enqueue(qe) < 0) {
+        fprintf(stderr, "Error enqueuing request.\n");
+        return -1;
+    }
+
+    gettimeofday(&(qe->tv_sent), NULL);
+    DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid);
+    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
+    if (rc < 0)
+        dequeue(qe);
+
+    return rc;
+}
+
+/* recv_message: blocking receive of one datagram into @qe.
+ *
+ * The header lands in qe->message; if qe->block is set, the payload lands
+ * there.  The sender's address is received into a local and discarded.
+ *
+ * @return: the value returned by recvmsg()
+ */
+int recv_message(bsq_t *qe) {
+ struct sockaddr_in from;
+ //int flen = sizeof(from);
+ int rc;
+
+ qe->msghdr.msg_name = &from;
+ qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
+ qe->msghdr.msg_iov = qe->iov;
+ if (qe->block)
+ qe->msghdr.msg_iovlen = 2;
+ else
+ qe->msghdr.msg_iovlen = 1;
+ qe->msghdr.msg_control = NULL;
+ qe->msghdr.msg_controllen = 0;
+ qe->msghdr.msg_flags = 0;
+
+ qe->iov[0].iov_base = (void *)&(qe->message);
+ qe->iov[0].iov_len = MSGBUFSIZE_ID;
+ if (qe->block) {
+ qe->iov[1].iov_base = qe->block;
+ qe->iov[1].iov_len = BLOCK_SIZE;
+ }
+
+ rc = recvmsg(bssock, &(qe->msghdr), 0);
+
+ //return recvfrom(bssock, (void *)&(qe->message), sizeof(bsmsg_t), 0,
+ // (struct sockaddr *)&from, &flen);
+ return rc;
+}
+
+/*
+ * get_server_number: map a socket address onto its index in bsservers[].
+ * @sin: address to look up
+ *
+ * Only slots with a non-NULL hostname are considered; address family,
+ * port and address must all be equal for a match.
+ *
+ * @return: index of the first matching server, or -1 if none matches
+ */
+int get_server_number(struct sockaddr_in *sin) {
+    int idx;
+
+#ifdef BSDEBUG2
+    fprintf(stderr,
+            "get_server_number(%u.%u.%u.%u/%u)\n",
+            (unsigned int)sin->sin_addr.s_addr & 0xff,
+            ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff,
+            ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff,
+            ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff,
+            (unsigned int)sin->sin_port);
+#endif
+
+    for (idx = 0; idx < MAX_SERVERS; idx++) {
+        /* Unconfigured slot: skip. */
+        if (!bsservers[idx].hostname)
+            continue;
+
+#ifdef BSDEBUG2
+        fprintf(stderr,
+                "get_server_number check %u.%u.%u.%u/%u\n",
+                (unsigned int)bsservers[idx].sin.sin_addr.s_addr&0xff,
+                ((unsigned int)bsservers[idx].sin.sin_addr.s_addr >> 8)&0xff,
+                ((unsigned int)bsservers[idx].sin.sin_addr.s_addr >> 16)&0xff,
+                ((unsigned int)bsservers[idx].sin.sin_addr.s_addr >> 24)&0xff,
+                (unsigned int)bsservers[idx].sin.sin_port);
+#endif
+
+        if (sin->sin_family != bsservers[idx].sin.sin_family)
+            continue;
+        if (sin->sin_port != bsservers[idx].sin.sin_port)
+            continue;
+        if (memcmp((void *)&(sin->sin_addr),
+                   (void *)&(bsservers[idx].sin.sin_addr),
+                   sizeof(struct in_addr)) != 0)
+            continue;
+
+        return idx;
+    }
+
+    return -1;
+}
+
+/* Spare receive buffer, kept between calls so it can be reused. */
+void *rx_buffer = NULL;
+/* The single receive entry returned by recv_any(). */
+bsq_t rx_qe;
+
+/*
+ * recv_any: blocking receive of the next datagram from any server.
+ *
+ * The header is received into rx_qe.message and the payload into a
+ * BLOCK_SIZE buffer (the spare one if available, otherwise freshly
+ * allocated).  rx_qe.length and rx_qe.server are filled in; server is -1
+ * when the sender is not a configured server.
+ *
+ * On a receive error the buffer is handed back as the spare.  It used to
+ * stay in rx_qe.block, where the next call overwrote and leaked it.
+ *
+ * @return: pointer to the static rx_qe, or NULL on error
+ */
+bsq_t *recv_any(void) {
+    struct sockaddr_in from;
+    int rc;
+
+    DB("ENTER recv_any\n");
+
+    rx_qe.msghdr.msg_name = &from;
+    rx_qe.msghdr.msg_namelen = sizeof(struct sockaddr_in);
+    rx_qe.msghdr.msg_iov = rx_qe.iov;
+    if (!rx_buffer) {
+        rx_buffer = malloc(BLOCK_SIZE);
+        if (!rx_buffer) {
+            perror("recv_any malloc");
+            return NULL;
+        }
+    }
+    /* Move the buffer from the spare slot into the receive entry. */
+    rx_qe.block = rx_buffer;
+    rx_buffer = NULL;
+    rx_qe.msghdr.msg_iovlen = 2;
+    rx_qe.msghdr.msg_control = NULL;
+    rx_qe.msghdr.msg_controllen = 0;
+    rx_qe.msghdr.msg_flags = 0;
+
+    rx_qe.iov[0].iov_base = (void *)&(rx_qe.message);
+    rx_qe.iov[0].iov_len = MSGBUFSIZE_ID;
+    rx_qe.iov[1].iov_base = rx_qe.block;
+    rx_qe.iov[1].iov_len = BLOCK_SIZE;
+
+    rc = recvmsg(bssock, &(rx_qe.msghdr), 0);
+    if (rc < 0) {
+        perror("recv_any");
+        rx_buffer = rx_qe.block;
+        rx_qe.block = NULL;
+        return NULL;
+    }
+
+    rx_qe.length = rc;
+    rx_qe.server = get_server_number(&from);
+
+    DB("recv_any from %d luid=%016llx len=%u\n",
+       rx_qe.server, rx_qe.message.luid, rx_qe.length);
+
+    return &rx_qe;
+}
+
+/* recv_recycle_buffer: if @q still owns its block buffer, move it into
+ * rx_buffer so that the next recv_any() reuses it instead of allocating.
+ */
+void recv_recycle_buffer(bsq_t *q) {
+ if (q->block) {
+ rx_buffer = q->block;
+ q->block = NULL;
+ }
+}
+
+/*
+ * wait_recv: block the calling thread until every request in @reqs has
+ * been marked BSQ_STATUS_MATCHED.
+ * @reqs:    array of outstanding requests
+ * @numreqs: number of entries in @reqs
+ *
+ * This function does not read the socket itself; it sleeps in
+ * RECV_AWAIT() and re-checks the status words each time it is woken.
+ *
+ * @return: @numreqs
+ */
+int wait_recv(bsq_t **reqs, int numreqs) {
+    unsigned int idx;
+    int pending;
+    int tid = (int)pthread_getspecific(tid_key);
+
+    DB("ENTER wait_recv %u\n", numreqs);
+
+    for (;;) {
+        /* Scan every request; any one without MATCHED keeps us waiting. */
+        pending = 0;
+        for (idx = 0; idx < numreqs; idx++) {
+            if ((reqs[idx]->status & BSQ_STATUS_MATCHED) == 0)
+                pending = 1;
+        }
+        if (!pending)
+            break;
+
+        RECV_AWAIT(tid);
+    }
+
+    DB("LEAVE wait_recv\n");
+    return numreqs;
+}
+
+/* retry: resend an already-prepared request.
+ *
+ * Refreshes qe->tv_sent, bumps the global retry counter and sends the
+ * stored msghdr again without blocking.
+ *
+ * @return: 0 on success, the negative sendmsg() result on failure
+ */
+static int retry_count = 0;
+int retry(bsq_t *qe)
+{
+ int rc;
+ gettimeofday(&(qe->tv_sent), NULL);
+ DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid);
+ retry_count++;
+ rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
+ if (rc < 0)
+ return rc;
+ return 0;
+}
+
+/*
+ * queue_runner: thread body that resends stale requests.
+ * @arg: unused
+ *
+ * Once a second, walks the outstanding-request queue under the queue lock
+ * and calls retry() on every entry whose last send is more than
+ * RETRY_TIMEOUT microseconds old.  Never returns.
+ */
+void *queue_runner(void *arg)
+{
+    for (;;) {
+        struct timeval now;
+        long long nowus, sus;
+        bsq_t *q;
+        int r;
+
+        sleep(1);
+
+        gettimeofday(&now, NULL);
+        /* Widen before multiplying: tv_sec * 1000000 overflows a 32-bit
+         * long. */
+        nowus = now.tv_usec + (long long)now.tv_sec * 1000000LL;
+        ENTER_QUEUE_CR;
+        r = retry_count;
+        for (q = bs_head; q; q = q->next) {
+            sus = q->tv_sent.tv_usec
+                + (long long)q->tv_sent.tv_sec * 1000000LL;
+            if ((nowus - sus) > RETRY_TIMEOUT) {
+                if (retry(q) < 0) {
+                    fprintf(stderr, "Error on sendmsg retry.\n");
+                }
+            }
+        }
+        /* Report how many retries this pass issued, and the total. */
+        if (r != retry_count) {
+            fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count);
+        }
+        LEAVE_QUEUE_CR;
+    }
+}
+
+/* receive_loop: thread body that receives replies and wakes requesters.
+ * @arg: unused
+ *
+ * Each received message is matched against the outstanding-request queue;
+ * on a match the thread recorded in the request is notified.  Never
+ * returns.
+ */
+void *receive_loop(void *arg)
+{
+ bsq_t *q, *m;
+
+ for(;;) {
+ q = recv_any();
+ if (!q) {
+ fprintf(stderr, "recv_any error\n");
+ }
+ else {
+ m = queuesearch(q);
+ /* return the receive buffer unless queuesearch() gave it away */
+ recv_recycle_buffer(q);
+ if (!m) {
+ fprintf(stderr, "Unmatched RX\n");
+ }
+ else {
+ DB("RX MATCH");
+ RECV_NOTIFY(m->tid);
+ }
+ }
+ }
+}
+pthread_t pthread_recv;
+
+/*****************************************************************************
+ * Reading *
+ *****************************************************************************/
+
+/*
+ * readblock_indiv: read one block from a single server.
+ * @server: index into bsservers[]
+ * @id:     block id as understood by that server
+ *
+ * Sends a BSOP_READBLOCK request and waits for the matching reply.  The
+ * request carries no buffer of its own: queuesearch() moves the received
+ * block into it.
+ *
+ * @return: pointer to the received block (caller frees), NULL on error
+ */
+void *readblock_indiv(int server, u64 id) {
+    void *block;
+    bsq_t *qe;
+    int rc;
+
+    qe = (bsq_t *)malloc(sizeof(bsq_t));
+    if (!qe) {
+        perror("readblock qe malloc");
+        return NULL;
+    }
+    qe->block = NULL;
+
+    qe->server = server;
+
+    qe->message.operation = BSOP_READBLOCK;
+    qe->message.flags = 0;
+    qe->message.id = id;
+    qe->length = MSGBUFSIZE_ID;
+
+    if (send_message(qe) < 0) {
+        perror("readblock sendto");
+        goto err;
+    }
+
+    rc = wait_recv(&qe, 1);
+    if (rc < 0) {
+        perror("readblock recv");
+        goto err;
+    }
+
+    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "readblock server error\n");
+        goto err;
+    }
+    if (qe->length < MSGBUFSIZE_BLOCK) {
+        /* Report the received length; this used to print an
+         * uninitialised local. */
+        fprintf(stderr, "readblock recv short (%u)\n", qe->length);
+        goto err;
+    }
+    block = qe->block;
+
+    free((void *)qe);
+    return block;
+
+ err:
+    /* Unlink the request before freeing it.  dequeue() is a no-op when
+     * the request has already left the queue. */
+    dequeue(qe);
+    if (qe->block)
+        free(qe->block);
+    free((void *)qe);
+    return NULL;
+}
+
+/**
+ * readblock: read a block from disk
+ * @id: block id to read
+ *
+ * Ids below 6 are always read from the first replica of the cluster; all
+ * other reads rotate through the replicas, one step per call.
+ *
+ * @return: pointer to block, NULL on error
+ */
+void *readblock(u64 id) {
+ int map = (int)BSID_MAP(id);
+ u64 xid;
+ /* replica used by the previous call; static, so it is shared by every
+ * caller and is advanced here without any locking */
+ static int i = CLUSTER_MAX_REPLICAS - 1;
+ void *block = NULL;
+
+ /* special case for the "superblock" just use the first block on the
+ * first replica. (extend to blocks < 6 for vdi bug)
+ */
+ if (id < 6) {
+ block = readblock_indiv(bsclusters[map].servers[0], id);
+ goto out;
+ }
+
+ i++;
+ if (i >= CLUSTER_MAX_REPLICAS)
+ i = 0;
+ /* NOTE(review): xid is only assigned for i in 0..2; this presumably
+ * relies on CLUSTER_MAX_REPLICAS being at most 3 -- confirm. */
+ switch (i) {
+ case 0:
+ xid = BSID_REPLICA0(id);
+ break;
+ case 1:
+ xid = BSID_REPLICA1(id);
+ break;
+ case 2:
+ xid = BSID_REPLICA2(id);
+ break;
+ }
+
+ block = readblock_indiv(bsclusters[map].servers[i], xid);
+
+ out:
+#ifdef BSDEBUG
+ /* trace the first eight bytes of whatever was read */
+ if (block)
+ fprintf(stderr, "READ: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
+ id,
+ (unsigned int)((unsigned char *)block)[0],
+ (unsigned int)((unsigned char *)block)[1],
+ (unsigned int)((unsigned char *)block)[2],
+ (unsigned int)((unsigned char *)block)[3],
+ (unsigned int)((unsigned char *)block)[4],
+ (unsigned int)((unsigned char *)block)[5],
+ (unsigned int)((unsigned char *)block)[6],
+ (unsigned int)((unsigned char *)block)[7]);
+ else
+ fprintf(stderr, "READ: %016llx NULL\n", id);
+#endif
+ return block;
+}
+
+/*****************************************************************************
+ * Writing *
+ *****************************************************************************/
+
+/*
+ * writeblock_indiv: start writing @block to one server.
+ * @server: index into bsservers[]
+ * @id:     block id as understood by that server
+ * @block:  block data; referenced by the request, not copied
+ *
+ * Builds and sends a BSOP_WRITEBLOCK request but does not wait for the
+ * reply; the caller waits on the returned request and then frees it.
+ *
+ * @return: the outstanding request, or NULL on failure
+ */
+bsq_t *writeblock_indiv(int server, u64 id, void *block) {
+    bsq_t *qe;
+
+    qe = (bsq_t *)malloc(sizeof(bsq_t));
+    if (!qe) {
+        perror("writeblock qe malloc");
+        return NULL;
+    }
+    qe->server = server;
+
+    qe->message.operation = BSOP_WRITEBLOCK;
+    qe->message.flags = 0;
+    qe->message.id = id;
+    qe->block = block;
+    qe->length = MSGBUFSIZE_BLOCK;
+
+    if (send_message(qe) < 0) {
+        perror("writeblock sendto");
+        /* send_message() queues the request before sending, so unlink
+         * it before it is freed. */
+        dequeue(qe);
+        free((void *)qe);
+        return NULL;
+    }
+
+    return qe;
+}
+
+
+/**
+ * writeblock: write an existing block to disk
+ * @id: block id
+ * @block: pointer to block
+ *
+ * Ids below 6 are written to the first replica only; every other block is
+ * written to all three replicas of its cluster.  The call returns once
+ * every request has been answered.
+ *
+ * @return: zero on success, -1 on failure
+ */
+int writeblock(u64 id, void *block) {
+
+    int map = (int)BSID_MAP(id);
+    int rep0 = bsclusters[map].servers[0];
+    int rep1 = bsclusters[map].servers[1];
+    int rep2 = bsclusters[map].servers[2];
+    bsq_t *reqs[3];
+    int rc, i;
+
+    reqs[0] = reqs[1] = reqs[2] = NULL;
+
+#ifdef BSDEBUG
+    fprintf(stderr,
+            "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
+            id,
+            (unsigned int)((unsigned char *)block)[0],
+            (unsigned int)((unsigned char *)block)[1],
+            (unsigned int)((unsigned char *)block)[2],
+            (unsigned int)((unsigned char *)block)[3],
+            (unsigned int)((unsigned char *)block)[4],
+            (unsigned int)((unsigned char *)block)[5],
+            (unsigned int)((unsigned char *)block)[6],
+            (unsigned int)((unsigned char *)block)[7]);
+#endif
+
+    /* special case for the "superblock" just use the first block on the
+     * first replica. (extend to blocks < 6 for vdi bug)
+     *
+     * This path used to leak the request, ignore the server's error flag
+     * and return wait_recv()'s count (1) instead of the documented zero.
+     */
+    if (id < 6) {
+        reqs[0] = writeblock_indiv(rep0, id, block);
+        if (!reqs[0])
+            return -1;
+        rc = wait_recv(reqs, 1);
+        if (rc < 0) {
+            perror("writeblock recv");
+            goto err;
+        }
+        if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
+            fprintf(stderr, "writeblock server0 error\n");
+            goto err;
+        }
+        free((void *)reqs[0]);
+        return 0;
+    }
+
+    reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block);
+    if (!reqs[0])
+        goto err;
+    reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block);
+    if (!reqs[1])
+        goto err;
+    reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block);
+    if (!reqs[2])
+        goto err;
+
+    rc = wait_recv(reqs, 3);
+    if (rc < 0) {
+        perror("writeblock recv");
+        goto err;
+    }
+    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "writeblock server0 error\n");
+        goto err;
+    }
+    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "writeblock server1 error\n");
+        goto err;
+    }
+    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "writeblock server2 error\n");
+        goto err;
+    }
+
+    free((void *)reqs[0]);
+    free((void *)reqs[1]);
+    free((void *)reqs[2]);
+    return 0;
+
+ err:
+    /* Unlink whatever is still queued, then release every request. */
+    for (i = 0; i < 3; i++) {
+        if (reqs[i]) {
+            dequeue(reqs[i]);
+            free((void *)reqs[i]);
+        }
+    }
+    return -1;
+}
+
+/*****************************************************************************
+ * Allocation *
+ *****************************************************************************/
+
+/**
+ * allocblock: write a new block to disk
+ * @block: pointer to block
+ *
+ * Equivalent to allocblock_hint() with a hint of 0.
+ *
+ * @return: new id of block on disk
+ */
+u64 allocblock(void *block) {
+ return allocblock_hint(block, 0);
+}
+
+/*
+ * allocblock_hint_indiv: start allocating a new block on one server.
+ * @server: index into bsservers[]
+ * @block:  block data; referenced by the request, not copied
+ * @hint:   allocation hint, sent in the request's id field
+ *
+ * Builds and sends a BSOP_ALLOCBLOCK request but does not wait for the
+ * reply; the caller waits on the returned request and then frees it.
+ *
+ * @return: the outstanding request, or NULL on failure
+ */
+bsq_t *allocblock_hint_indiv(int server, void *block, u64 hint) {
+    bsq_t *qe;
+
+    qe = (bsq_t *)malloc(sizeof(bsq_t));
+    if (!qe) {
+        perror("allocblock_hint qe malloc");
+        return NULL;
+    }
+    qe->server = server;
+
+    qe->message.operation = BSOP_ALLOCBLOCK;
+    qe->message.flags = 0;
+    qe->message.id = hint;
+    qe->block = block;
+    qe->length = MSGBUFSIZE_BLOCK;
+
+    if (send_message(qe) < 0) {
+        perror("allocblock_hint sendto");
+        /* send_message() queues the request before sending, so unlink
+         * it before it is freed. */
+        dequeue(qe);
+        free((void *)qe);
+        return NULL;
+    }
+
+    return qe;
+}
+
+/**
+ * allocblock_hint: write a new block to disk
+ * @block: pointer to block
+ * @hint: allocation hint
+ *
+ * The block is allocated on all three replicas of the cluster selected by
+ * @hint, and the three per-server ids are combined into one id with BSID().
+ *
+ * @return: new id of block on disk, 0 on failure
+ */
+u64 allocblock_hint(void *block, u64 hint) {
+ /* NOTE(review): hint is used directly as the bsclusters[] index with no
+ * range check -- confirm that callers only pass valid cluster numbers. */
+ int map = (int)hint;
+ int rep0 = bsclusters[map].servers[0];
+ int rep1 = bsclusters[map].servers[1];
+ int rep2 = bsclusters[map].servers[2];
+ bsq_t *reqs[3];
+ int rc;
+ u64 id0, id1, id2;
+
+ reqs[0] = reqs[1] = reqs[2] = NULL;
+
+ DB("ENTER allocblock\n");
+
+ /* issue all three requests before waiting for any reply */
+ reqs[0] = allocblock_hint_indiv(rep0, block, hint);
+ if (!reqs[0])
+ goto err;
+ reqs[1] = allocblock_hint_indiv(rep1, block, hint);
+ if (!reqs[1])
+ goto err;
+ reqs[2] = allocblock_hint_indiv(rep2, block, hint);
+ if (!reqs[2])
+ goto err;
+
+ rc = wait_recv(reqs, 3);
+ if (rc < 0) {
+ perror("allocblock recv");
+ goto err;
+ }
+ if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
+ fprintf(stderr, "allocblock server0 error\n");
+ goto err;
+ }
+ if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
+ fprintf(stderr, "allocblock server1 error\n");
+ goto err;
+ }
+ if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
+ fprintf(stderr, "allocblock server2 error\n");
+ goto err;
+ }
+
+ /* each reply carries the id that server assigned */
+ id0 = reqs[0]->message.id;
+ id1 = reqs[1]->message.id;
+ id2 = reqs[2]->message.id;
+
+#ifdef BSDEBUG
+ fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
+ BSID(map, id0, id1, id2),
+ (unsigned int)((unsigned char *)block)[0],
+ (unsigned int)((unsigned char *)block)[1],
+ (unsigned int)((unsigned char *)block)[2],
+ (unsigned int)((unsigned char *)block)[3],
+ (unsigned int)((unsigned char *)block)[4],
+ (unsigned int)((unsigned char *)block)[5],
+ (unsigned int)((unsigned char *)block)[6],
+ (unsigned int)((unsigned char *)block)[7]);
+#endif
+
+ free((void *)reqs[0]);
+ free((void *)reqs[1]);
+ free((void *)reqs[2]);
+ return BSID(map, id0, id1, id2);
+
+ err:
+ /* unlink any request that is still queued before freeing it */
+ if (reqs[0]) {
+ dequeue(reqs[0]);
+ free((void *)reqs[0]);
+ }
+ if (reqs[1]) {
+ dequeue(reqs[1]);
+ free((void *)reqs[1]);
+ }
+ if (reqs[2]) {
+ dequeue(reqs[2]);
+ free((void *)reqs[2]);
+ }
+ return 0;
+}
+
+#else /* /BLOCKSTORE_REMOTE */
+
+/*****************************************************************************
+ * Local storage version *
+ *****************************************************************************/
+
+/**
+ * readblock: read a block from disk
+ * @id: block id to read
+ *
+ * Block ids are 1-based: block @id lives at byte offset
+ * (@id - 1) * BLOCK_SIZE of "blockstore.dat" in the current directory.
+ * The file is opened and closed on every call.
+ *
+ * @return: pointer to a newly allocated block (caller frees), NULL on error
+ */
+
+void *readblock(u64 id) {
+    void *buf = NULL;
+    int fd;
+
+    fd = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644);
+    if (fd < 0) {
+        perror("open");
+        return NULL;
+    }
+
+    if (lseek64(fd, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
+        printf ("%Ld ", id);
+        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
+        perror("readblock lseek");
+    } else if ((buf = malloc(BLOCK_SIZE)) == NULL) {
+        perror("readblock malloc");
+    } else if (read(fd, buf, BLOCK_SIZE) != BLOCK_SIZE) {
+        /* anything short of a whole block is treated as a failure */
+        perror("readblock read");
+        free(buf);
+        buf = NULL;
+    }
+
+    /* single exit: buf is NULL on every failure path */
+    close(fd);
+    return buf;
+}
+
+/**
+ * writeblock: write an existing block to disk
+ * @id: block id
+ * @block: pointer to block
+ *
+ * Block ids are 1-based: the data is written at byte offset
+ * (@id - 1) * BLOCK_SIZE of "blockstore.dat" in the current directory.
+ *
+ * @return: zero on success, -1 on failure
+ */
+int writeblock(u64 id, void *block) {
+
+    int block_fp;
+
+    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
+
+    if (block_fp < 0) {
+        perror("open");
+        return -1;
+    }
+
+    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
+        perror("writeblock lseek");
+        goto err;
+    }
+    /* A short write is a failure too, matching the check in allocblock(). */
+    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
+        perror("writeblock write");
+        goto err;
+    }
+    close(block_fp);
+    return 0;
+
+err:
+    close(block_fp);
+    return -1;
+}
+
+/**
+ * allocblock: write a new block to disk
+ * @block: pointer to block
+ *
+ * The block is appended to "blockstore.dat"; its id is its 1-based
+ * position in the file.
+ *
+ * NOTE(review): the seek to end-of-file and the write are separate calls
+ * on a private descriptor, so concurrent callers could presumably observe
+ * the same end offset -- confirm whether callers serialise allocation.
+ *
+ * @return: new id of block on disk, 0 on failure
+ */
+
+u64 allocblock(void *block) {
+ u64 lb;
+ off64_t pos;
+ int block_fp;
+
+ block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
+
+ if (block_fp < 0) {
+ perror("open");
+ return 0;
+ }
+
+ pos = lseek64(block_fp, 0, SEEK_END);
+ if (pos == (off64_t)-1) {
+ perror("allocblock lseek");
+ goto err;
+ }
+ /* refuse to append to a file that is not a whole number of blocks */
+ if (pos % BLOCK_SIZE != 0) {
+ fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
+ goto err;
+ }
+ if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
+ perror("allocblock write");
+ goto err;
+ }
+ /* ids are 1-based, so 0 is free to signal failure */
+ lb = pos / BLOCK_SIZE + 1;
+//printf("alloc(%Ld)\n", lb);
+ close(block_fp);
+ return lb;
+
+err:
+ close(block_fp);
+ return 0;
+
+}
+
+/**
+ * allocblock_hint: write a new block to disk
+ * @block: pointer to block
+ * @hint: allocation hint (ignored by the local store)
+ *
+ * @return: new id of block on disk
+ */
+u64 allocblock_hint(void *block, u64 hint) {
+ return allocblock(block);
+}
+
+#endif /* BLOCKSTORE_REMOTE */
+
+/*****************************************************************************
+ * Memory management *
+ *****************************************************************************/
+
+/**
+ * newblock: allocate a zero-filled in-memory block of BLOCK_SIZE bytes
+ *
+ * @return: pointer to the new block (release with freeblock()), NULL on
+ *          allocation failure
+ */
+void *newblock() {
+    void *zeroed = calloc(1, BLOCK_SIZE);
+
+    if (zeroed == NULL)
+        perror("newblock");
+
+    return zeroed;
+}
+
+
+/**
+ * freeblock: release an in-memory block
+ * @block: block to be freed; NULL is accepted and ignored
+ */
+void freeblock(void *block) {
+    /* free() already ignores NULL, so no separate test is needed */
+    free(block);
+}
+
+/* new_freeblock: allocate an empty in-memory freelist block, with the
+ * magic set and next, count and list all zero.
+ *
+ * @return: the new block, or NULL if newblock() failed
+ */
+static freeblock_t *new_freeblock(void)
+{
+ freeblock_t *fb;
+
+ fb = newblock();
+
+ if (fb == NULL) return NULL;
+
+ fb->magic = FREEBLOCK_MAGIC;
+ fb->next = 0ULL;
+ fb->count = 0ULL;
+ memset(fb->list, 0, sizeof fb->list);
+
+ return fb;
+}
+
+/*
+ * releaseblock: add block @id to the on-disk freelist.
+ *
+ * The superblock points at a "current" freelist block that is still being
+ * filled and at a chain of full ones.  The id is appended to the current
+ * block.  When that block is already full, it is first pushed onto the
+ * full chain and a fresh current block is allocated.
+ *
+ * If the superblock or a freelist block cannot be read or allocated, the
+ * function reports the problem and returns without recording the id.
+ * Those cases used to dereference NULL.
+ */
+void releaseblock(u64 id)
+{
+    blockstore_super_t *bs_super;
+    freeblock_t *fl_current = NULL;
+
+    /* get superblock */
+    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
+    if (bs_super == NULL) {
+        fprintf(stderr, "releaseblock: couldn't read superblock\n");
+        return;
+    }
+
+    /* get freeblock_current */
+    if (bs_super->freelist_current == 0ULL)
+    {
+        fl_current = new_freeblock();
+        if (fl_current == NULL)
+            goto fail;
+        bs_super->freelist_current = allocblock(fl_current);
+        writeblock(BLOCKSTORE_SUPER, bs_super);
+    } else {
+        fl_current = readblock(bs_super->freelist_current);
+        if (fl_current == NULL)
+            goto fail;
+    }
+
+    /* if full, chain to superblock and allocate new current */
+
+    if (fl_current->count == (FREEBLOCK_SIZE)) {
+        fl_current->next = bs_super->freelist_full;
+        writeblock(bs_super->freelist_current, fl_current);
+        bs_super->freelist_full = bs_super->freelist_current;
+        freeblock(fl_current);
+        fl_current = new_freeblock();
+        if (fl_current == NULL)
+            goto fail;
+        bs_super->freelist_current = allocblock(fl_current);
+        writeblock(BLOCKSTORE_SUPER, bs_super);
+    }
+
+    /* append id to current */
+    fl_current->list[fl_current->count++] = id;
+    writeblock(bs_super->freelist_current, fl_current);
+
+    freeblock(fl_current);
+    freeblock(bs_super);
+    return;
+
+ fail:
+    fprintf(stderr, "releaseblock: couldn't get a freelist block\n");
+    freeblock(fl_current);
+    freeblock(bs_super);
+}
+
+/* freelist debug functions: */
+
+/*
+ * freelist_count: print how many block ids are on the freelist.
+ * @print_each: when 1, also print every id
+ *
+ * Counts the "current" freelist block first and then walks the chain of
+ * full blocks.  The superblock copy is now released on every path, and a
+ * failed read ends the walk instead of dereferencing NULL.
+ */
+void freelist_count(int print_each)
+{
+    blockstore_super_t *bs_super;
+    freeblock_t *fb;
+    u64 total = 0, next;
+    int i;
+
+    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
+    if (bs_super == NULL)
+        return;
+
+    if (bs_super->freelist_current == 0ULL) {
+        printf("freelist is empty!\n");
+        goto out;
+    }
+
+    fb = readblock(bs_super->freelist_current);
+    if (fb == NULL)
+        goto out;
+    printf("%Ld entires on current.\n", fb->count);
+    total += fb->count;
+    if (print_each == 1)
+    {
+        for (i=0; i< fb->count; i++)
+            printf(" %Ld\n", fb->list[i]);
+    }
+
+    freeblock(fb);
+
+    if (bs_super->freelist_full == 0ULL) {
+        printf("freelist_full is empty!\n");
+        goto out;
+    }
+
+    /* follow the chain of full blocks until a next pointer of 0 */
+    next = bs_super->freelist_full;
+    for (;;) {
+        fb = readblock(next);
+        if (fb == NULL)
+            goto out;
+        total += fb->count;
+        if (print_each == 1)
+        {
+            for (i=0; i< fb->count; i++)
+                printf(" %Ld\n", fb->list[i]);
+        }
+        next = fb->next;
+        freeblock(fb);
+        if (next == 0ULL) break;
+    }
+    printf("Total of %Ld ids on freelist.\n", total);
+
+ out:
+    freeblock(bs_super);
+}
+
+/*****************************************************************************
+ * Initialisation *
+ *****************************************************************************/
+
+/*
+ * __init_blockstore: one-time initialisation of the block store.
+ *
+ * Remote build: initialises the locks and per-thread wakeup slots,
+ * resolves the configured servers, fills in the cluster map, binds the
+ * local UDP socket and starts the receive and retry threads.
+ *
+ * Local build: makes sure "blockstore.dat" exists.  An empty file gets a
+ * fresh superblock; otherwise the superblock's magic is verified, and the
+ * process exits if it is wrong.
+ *
+ * @return: 0 on success, -1 on failure
+ */
+int __init_blockstore(void)
+{
+#ifdef BLOCKSTORE_REMOTE
+    int i;
+    struct hostent *addr;
+
+    pthread_mutex_init(&ptmutex_queue, NULL);
+    pthread_mutex_init(&ptmutex_luid, NULL);
+    pthread_mutex_init(&ptmutex_recv, NULL);
+    /*pthread_mutex_init(&ptmutex_notify, NULL);*/
+    for (i = 0; i <= READ_POOL_SIZE; i++) {
+        pool_thread[i].newdata = 0;
+        pthread_mutex_init(&(pool_thread[i].ptmutex), NULL);
+        pthread_cond_init(&(pool_thread[i].ptcv), NULL);
+    }
+
+    bsservers[0].hostname = "firebug.cl.cam.ac.uk";
+    bsservers[1].hostname = "planb.cl.cam.ac.uk";
+    bsservers[2].hostname = "simcity.cl.cam.ac.uk";
+    bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/;
+    bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/;
+    bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/;
+    bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/;
+    bsservers[7].hostname = NULL/*"felix.cl.cam.ac.uk"*/;
+    bsservers[8].hostname = NULL;
+    bsservers[9].hostname = NULL;
+    bsservers[10].hostname = NULL;
+    bsservers[11].hostname = NULL;
+    bsservers[12].hostname = NULL;
+    bsservers[13].hostname = NULL;
+    bsservers[14].hostname = NULL;
+    bsservers[15].hostname = NULL;
+
+    for (i = 0; i < MAX_SERVERS; i++) {
+        if (!bsservers[i].hostname)
+            continue;
+        addr = gethostbyname(bsservers[i].hostname);
+        if (!addr) {
+            perror("bad hostname");
+            return -1;
+        }
+        bsservers[i].sin.sin_family = addr->h_addrtype;
+        bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT);
+        bsservers[i].sin.sin_addr.s_addr =
+            ((struct in_addr *)(addr->h_addr))->s_addr;
+    }
+
+    /* Cluster map
+     */
+    bsclusters[0].servers[0] = 0;
+    bsclusters[0].servers[1] = 1;
+    bsclusters[0].servers[2] = 2;
+    bsclusters[1].servers[0] = 1;
+    bsclusters[1].servers[1] = 2;
+    bsclusters[1].servers[2] = 3;
+    bsclusters[2].servers[0] = 2;
+    bsclusters[2].servers[1] = 3;
+    bsclusters[2].servers[2] = 4;
+    bsclusters[3].servers[0] = 3;
+    bsclusters[3].servers[1] = 4;
+    bsclusters[3].servers[2] = 5;
+    bsclusters[4].servers[0] = 4;
+    bsclusters[4].servers[1] = 5;
+    bsclusters[4].servers[2] = 6;
+    bsclusters[5].servers[0] = 5;
+    bsclusters[5].servers[1] = 6;
+    bsclusters[5].servers[2] = 7;
+    bsclusters[6].servers[0] = 6;
+    bsclusters[6].servers[1] = 7;
+    bsclusters[6].servers[2] = 0;
+    bsclusters[7].servers[0] = 7;
+    bsclusters[7].servers[1] = 0;
+    bsclusters[7].servers[2] = 1;
+
+    /* Local socket set up
+     */
+    bssock = socket(AF_INET, SOCK_DGRAM, 0);
+    if (bssock < 0) {
+        perror("Bad socket");
+        return -1;
+    }
+    memset(&sin_local, 0, sizeof(sin_local));
+    sin_local.sin_family = AF_INET;
+    sin_local.sin_port = htons(BLOCKSTORED_PORT);
+    sin_local.sin_addr.s_addr = htonl(INADDR_ANY);
+    if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) {
+        perror("bind");
+        close(bssock);
+        return -1;
+    }
+
+    /* Both thread ids are stored in pthread_recv, so only the second
+     * survives; neither thread is joined in this file. */
+    pthread_create(&pthread_recv, NULL, receive_loop, NULL);
+    pthread_create(&pthread_recv, NULL, queue_runner, NULL);
+
+#else /* /BLOCKSTORE_REMOTE */
+    blockstore_super_t *bs_super;
+    int block_fp;
+
+    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
+
+    if (block_fp < 0) {
+        perror("open");
+        return -1;
+    }
+
+    if (lseek(block_fp, 0, SEEK_END) == 0) {
+        /* Empty store: write a fresh superblock as the first block. */
+        bs_super = newblock();
+        if (bs_super == NULL) {
+            close(block_fp);
+            return -1;
+        }
+        bs_super->magic = BLOCKSTORE_MAGIC;
+        bs_super->freelist_full = 0LL;
+        bs_super->freelist_current = 0LL;
+
+        /* allocblock() returns 0 when the block could not be written */
+        if (allocblock(bs_super) == 0) {
+            freeblock(bs_super);
+            close(block_fp);
+            return -1;
+        }
+
+        freeblock(bs_super);
+    } else {
+        bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
+        if (bs_super == NULL) {
+            fprintf(stderr, "couldn't read blockstore superblock.\n");
+            close(block_fp);
+            return -1;
+        }
+        if (bs_super->magic != BLOCKSTORE_MAGIC)
+        {
+            printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n");
+            exit(-1);
+        }
+        freeblock(bs_super);
+    }
+
+    close(block_fp);
+
+#endif /* BLOCKSTORE_REMOTE */
+    return 0;
+}
+
+/*
+ * __exit_blockstore: destroy the mutexes and condition variables created
+ * by __init_blockstore().  Does nothing in the local (non-remote) build.
+ */
+void __exit_blockstore(void)
+{
+#ifdef BLOCKSTORE_REMOTE
+    /* Declared inside the #ifdef so that the local build does not see an
+     * unused variable, which -Wall -Werror turns into a build failure. */
+    int i;
+
+    pthread_mutex_destroy(&ptmutex_recv);
+    pthread_mutex_destroy(&ptmutex_luid);
+    pthread_mutex_destroy(&ptmutex_queue);
+    /*pthread_mutex_destroy(&ptmutex_notify);
+      pthread_cond_destroy(&ptcv_notify);*/
+    for (i = 0; i <= READ_POOL_SIZE; i++) {
+        pthread_mutex_destroy(&(pool_thread[i].ptmutex));
+        pthread_cond_destroy(&(pool_thread[i].ptcv));
+    }
+#endif
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/blockstore.h
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/blockstore.h Sun Jul 3 14:14:09 2005
@@ -0,0 +1,134 @@
+/**************************************************************************
+ *
+ * blockstore.h
+ *
+ * Simple block store interface
+ *
+ */
+
+#ifndef __BLOCKSTORE_H__
+#define __BLOCKSTORE_H__
+
+#include <netinet/in.h>
+#include <xc.h>
+
+#define BLOCK_SIZE 4096
+#define BLOCK_SHIFT 12
+#define BLOCK_MASK 0xfffffffffffff000LL
+
+/* XXX SMH: where is the below supposed to be defined???? */
+#ifndef SECTOR_SHIFT
+#define SECTOR_SHIFT 9
+#endif
+
+/* Number of u64 slots in freeblock_t.list[].  The expansion is fully
+ * parenthesised so the macro can be used inside larger expressions
+ * (multiplication, division, comparison) without precedence surprises.
+ * NOTE(review): this subtracts 3 * sizeof(u64) slots, not the 3 header
+ * words, so a freeblock_t is smaller than BLOCK_SIZE.  Left as is, since
+ * changing the value would change the stored layout -- confirm intent. */
+#define FREEBLOCK_SIZE ((BLOCK_SIZE / sizeof(u64)) - (3 * sizeof(u64)))
+#define FREEBLOCK_MAGIC 0x0fee0fee0fee0feeULL
+
+typedef struct { /* free-list block: three u64 header words, then FREEBLOCK_SIZE u64 slots */
+ u64 magic; /* presumably FREEBLOCK_MAGIC when valid -- confirm in blockstore.c */
+ u64 next; /* presumably the id of the next free-list block -- TODO confirm */
+ u64 count; /* presumably the number of list[] slots in use -- TODO confirm */
+ u64 list[FREEBLOCK_SIZE];
+} freeblock_t;
+
+#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaaULL /* value __init_blockstore() stores in, and expects from, the superblock magic */
+#define BLOCKSTORE_SUPER 1ULL /* block id __init_blockstore() reads the superblock from */
+
+typedef struct { /* superblock, validated by __init_blockstore() at start-up */
+ u64 magic; /* BLOCKSTORE_MAGIC; any other value is reported as a corrupt store */
+ u64 freelist_full; /* initialised to 0 for a newly created store */
+ u64 freelist_current; /* initialised to 0 for a newly created store */
+} blockstore_super_t;
+
+/* Allocate an in-memory block-sized buffer.  Callers check the result for
+ * NULL and release it with freeblock(). */
+extern void *newblock(void);
+
+/* Read block 'id' into a buffer.  Callers treat NULL as failure and release
+ * the buffer with freeblock(). */
+extern void *readblock(u64 id);
+
+/* Store 'block' under a newly allocated id and return that id.
+ * NOTE(review): presumably ALLOCFAIL signals failure -- confirm. */
+extern u64 allocblock(void *block);
+extern u64 allocblock_hint(void *block, u64 hint);
+
+/* Write 'block' to the already allocated block 'id'. */
+extern int writeblock(u64 id, void *block);
+
+/* Add this blockid to a freelist, to be recycled by the allocator. */
+extern void releaseblock(u64 id);
+
+/* this is a memory free() operation for block-sized allocations */
+extern void freeblock(void *block);
+
+/* Start-up and tear-down of the block store.  __init_blockstore() returns
+ * 0 on success and -1 on failure. */
+extern int __init_blockstore(void);
+extern void __exit_blockstore(void);
+
+/* debug for freelist. */
+void freelist_count(int print_each);
+#define ALLOCFAIL (((u64)(-1)))
+
+/* Distribution
+ */
+#define BLOCKSTORED_PORT 9346
+
+struct bshdr_t_struct { /* packed header at the start of every bsmsg_t */
+ u32 operation; /* presumably one of the BSOP_* codes -- TODO confirm */
+ u32 flags; /* presumably BSOP_FLAG_* bits -- TODO confirm */
+ u64 id; /* presumably the block id the message refers to -- TODO confirm */
+ u64 luid; /* NOTE(review): meaning not visible in this header -- see blockstore.c */
+} __attribute__ ((packed));
+typedef struct bshdr_t_struct bshdr_t;
+
+struct bsmsg_t_struct { /* header followed by one BLOCK_SIZE-byte payload, packed */
+ bshdr_t hdr;
+ unsigned char block[BLOCK_SIZE];
+} __attribute__ ((packed));
+
+typedef struct bsmsg_t_struct bsmsg_t;
+
+/* Byte counts measured from the start of a bsmsg_t, following the field
+ * order of the packed bshdr_t (operation, flags, id, luid):
+ *   MSGBUFSIZE_OP    - the operation word only
+ *   MSGBUFSIZE_FLAGS - operation + flags
+ *   MSGBUFSIZE_ID    - operation + flags + id + luid (the whole header)
+ *   MSGBUFSIZE_BLOCK - header plus the data block
+ * MSGBUFSIZE_ID is kept on one line: the previous definition was split
+ * across two lines without a continuation, which truncated the macro. */
+#define MSGBUFSIZE_OP (sizeof(u32))
+#define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32))
+#define MSGBUFSIZE_ID (sizeof(u32) + sizeof(u32) + sizeof(u64) + sizeof(u64))
+#define MSGBUFSIZE_BLOCK (sizeof(bsmsg_t))
+
+#define BSOP_READBLOCK 0x01 /* BSOP_*: presumably values for bshdr_t.operation -- TODO confirm */
+#define BSOP_WRITEBLOCK 0x02
+#define BSOP_ALLOCBLOCK 0x03
+#define BSOP_FREEBLOCK 0x04
+
+#define BSOP_FLAG_ERROR 0x01 /* presumably a bshdr_t.flags bit marking a failed operation -- TODO confirm */
+
+#define BS_ALLOC_SKIP 10 /* NOTE(review): consumer not visible in this header */
+#define BS_ALLOC_HACK /* NOTE(review): valueless feature switch; its effect is not visible in this header */
+
+/* Remote hosts and cluster map - XXX need to generalise
+ */
+
+/*
+
+ Interim ID format is
+
+ 63 60 59 40 39 20 19 0
+ +----+--------------------+--------------------+--------------------+
+ |map | replica 2 | replica 1 | replica 0 |
+ +----+--------------------+--------------------+--------------------+
+
+ The map is an index into a table detailing which machines form the
+ cluster.
+
+ */
+
+#define BSID_REPLICA0(_id) ((_id)&0xfffffULL) /* replica 0: bits 0-19 of the id */
+#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL) /* replica 1: bits 20-39 */
+#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL) /* replica 2: bits 40-59 */
+#define BSID_MAP(_id) (((_id)>>60)&0xfULL) /* cluster map index: bits 60-63 */
+
+#define BSID(_map, _rep0, _rep1, _rep2) ((((u64)(_map))<<60) | \
+ (((u64)(_rep2))<<40) | \
+ (((u64)(_rep1))<<20) | ((u64)(_rep0))) /* packs the four fields; arguments are not masked, so an over-wide value spills into the neighbouring field */
+
+typedef struct bsserver_t_struct { /* one remote block-store server */
+ char *hostname;
+ struct sockaddr_in sin; /* presumably the resolved address of hostname -- TODO confirm */
+} bsserver_t;
+
+#define MAX_SERVERS 16
+
+#define CLUSTER_MAX_REPLICAS 3
+typedef struct bscluster_t_struct { /* the servers used for the replicas of one cluster */
+ int servers[CLUSTER_MAX_REPLICAS]; /* filled with small integer constants by __init_blockstore(); presumably indices into the server table -- TODO confirm */
+} bscluster_t;
+
+#define MAX_CLUSTERS 16
+
+#endif /* __BLOCKSTORE_H__ */
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/parallax.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/parallax.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,611 @@
+/**************************************************************************
+ *
+ * parallax.c
+ *
+ * The Parallax Storage Server
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include "blktaplib.h"
+#include "blockstore.h"
+#include "vdi.h"
+#include "block-async.h"
+#include "requests-async.h"
+
+#define PARALLAX_DEV 61440
+#define SECTS_PER_NODE 8
+
+
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* ------[ session records ]----------------------------------------------- */
+
+#define BLKIF_HASHSZ 1024
+#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) /* blkif_hash bucket for (domid, handle) */
+
+#define VDI_HASHSZ 16
+#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1)) /* vdi_hash bucket for a virtual device number */
+
+typedef struct blkif { /* one block interface, identified by (domid, handle) */
+ domid_t domid;
+ unsigned int handle;
+ enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; /* set to DISCONNECTED by blkif_create(); blkif_destroy() refuses any other state */
+ vdi_t *vdi_hash[VDI_HASHSZ]; /* VDIs attached to this interface, chained through vdi_t.next */
+ struct blkif *hash_next; /* next interface in the same blkif_hash bucket */
+} blkif_t;
+
+static blkif_t *blkif_hash[BLKIF_HASHSZ]; /* all known interfaces, bucketed by BLKIF_HASH() */
+
+/*
+ * Look up the interface record for (domid, handle) in blkif_hash.
+ *
+ * Returns the matching record, or NULL when there is none.  A non-zero
+ * handle only produces a warning; the lookup is still carried out.
+ */
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
+{
+    blkif_t *cur;
+
+    if ( handle != 0 )
+        printf("blktap/parallax don't currently support non-0 dev handles!\n");
+
+    for ( cur = blkif_hash[BLKIF_HASH(domid, handle)];
+          cur != NULL;
+          cur = cur->hash_next )
+    {
+        if ( (cur->domid == domid) && (cur->handle == handle) )
+            return cur;
+    }
+
+    return NULL;
+}
+
+/*
+ * Find the VDI attached to 'blkif' under virtual device number 'device'.
+ * Returns NULL when nothing is attached under that number.
+ */
+vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device)
+{
+    vdi_t *cur;
+
+    for (cur = blkif->vdi_hash[VDI_HASH(device)]; cur != NULL; cur = cur->next)
+    {
+        if (cur->vdevice == device)
+            break;
+    }
+
+    return cur;
+}
+
+/* ------[ control message handling ]-------------------------------------- */
+
+/*
+ * Handle CMSG_BLKIF_BE_CREATE: register a new, disconnected interface for
+ * (create->domid, create->blkif_handle).
+ *
+ * The outcome is reported through create->status:
+ *   BLKIF_BE_STATUS_OUT_OF_MEMORY    - the record could not be allocated
+ *   BLKIF_BE_STATUS_INTERFACE_EXISTS - the pair is already registered
+ *   BLKIF_BE_STATUS_OKAY             - the record was added to blkif_hash
+ */
+void blkif_create(blkif_be_create_t *create)
+{
+    domid_t domid = create->domid;
+    unsigned int handle = create->blkif_handle;
+    blkif_t *fresh;
+    blkif_t **link;
+
+    DPRINTF("parallax (blkif_create): create is %p\n", create);
+
+    /* The record is allocated up front, before the duplicate check. */
+    fresh = (blkif_t *)malloc(sizeof(blkif_t));
+    if ( fresh == NULL )
+    {
+        DPRINTF("Could not create blkif: out of memory\n");
+        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    memset(fresh, 0, sizeof(*fresh));
+    fresh->domid = domid;
+    fresh->handle = handle;
+    fresh->status = DISCONNECTED;
+
+    /* Walk the bucket; 'link' ends up addressing the terminating NULL. */
+    for ( link = &blkif_hash[BLKIF_HASH(domid, handle)];
+          *link != NULL;
+          link = &(*link)->hash_next )
+    {
+        if ( ((*link)->domid != domid) || ((*link)->handle != handle) )
+            continue;
+
+        DPRINTF("Could not create blkif: already exists (%d,%d)\n",
+                domid, handle);
+        create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
+        free(fresh);
+        return;
+    }
+
+    /* Append at the tail of the chain. */
+    fresh->hash_next = NULL;
+    *link = fresh;
+
+    DPRINTF("Successfully created blkif\n");
+    create->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/*
+ * Handle CMSG_BLKIF_BE_DESTROY: unlink and free the interface record for
+ * (destroy->domid, destroy->blkif_handle).
+ *
+ * The outcome is reported through destroy->status:
+ *   BLKIF_BE_STATUS_INTERFACE_NOT_FOUND - no such interface
+ *   BLKIF_BE_STATUS_INTERFACE_CONNECTED - found, but not DISCONNECTED
+ *   BLKIF_BE_STATUS_OKAY                - record removed and freed
+ */
+void blkif_destroy(blkif_be_destroy_t *destroy)
+{
+    domid_t domid = destroy->domid;
+    unsigned int handle = destroy->blkif_handle;
+    blkif_t **link;
+    blkif_t *victim;
+
+    DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy);
+
+    for ( link = &blkif_hash[BLKIF_HASH(domid, handle)];
+          (victim = *link) != NULL;
+          link = &victim->hash_next )
+    {
+        if ( (victim->domid != domid) || (victim->handle != handle) )
+            continue;
+
+        if ( victim->status != DISCONNECTED )
+        {
+            destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
+            return;
+        }
+
+        /* Splice the record out of its bucket before releasing it. */
+        *link = victim->hash_next;
+        free(victim);
+        destroy->status = BLKIF_BE_STATUS_OKAY;
+        return;
+    }
+
+    destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+}
+
+/*
+ * Handle CMSG_BLKIF_BE_VBD_CREATE: attach the VDI named by
+ * create->dev_handle to an existing interface as virtual device
+ * create->vdevice.
+ *
+ * The outcome is reported through create->status:
+ *   BLKIF_BE_STATUS_INTERFACE_NOT_FOUND - no interface for (domid, handle)
+ *   BLKIF_BE_STATUS_VBD_NOT_FOUND       - vdi_get() could not find the VDI
+ *   BLKIF_BE_STATUS_OKAY                - VDI appended to the interface
+ */
+void vbd_create(blkif_be_vbd_create_t *create)
+{
+    blkif_t *blkif;
+    vdi_t *vdi, **vdip;
+    blkif_vdev_t vdevice = create->vdevice;
+
+    DPRINTF("parallax (vbd_create): create=%p\n", create);
+
+    blkif = blkif_find_by_handle(create->domid, create->blkif_handle);
+    if ( blkif == NULL )
+    {
+        DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n",
+                create->domid, create->blkif_handle);
+        create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    /* The VDI identifier arrives in create->dev_handle. */
+    DPRINTF("vbd_create: create->dev_handle (id) is %lx\n",
+            (unsigned long)create->dev_handle);
+
+    vdi = vdi_get(create->dev_handle);
+    if (vdi == NULL)
+    {
+        printf("parallax (vbd_create): VDI %lx not found.\n",
+               (unsigned long)create->dev_handle);
+        create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+        return;
+    }
+
+    /* Append the VDI to the tail of its hash chain.
+     * NOTE(review): an existing entry with the same vdevice is not
+     * detected; blkif_get_vdi() returns the first match, so the earlier
+     * entry would shadow this one. */
+    vdi->next = NULL;
+    vdi->vdevice = vdevice;
+    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
+    while (*vdip != NULL)
+        vdip = &(*vdip)->next;
+    *vdip = vdi;
+
+    DPRINTF("vbd_create succeeded\n");
+    create->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/*
+ * Handle CMSG_BLKIF_BE_VBD_DESTROY: detach virtual device
+ * destroy->vdevice from its interface and release the VDI handle.
+ *
+ * The outcome is reported through destroy->status on every path:
+ *   BLKIF_BE_STATUS_INTERFACE_NOT_FOUND - no interface for (domid, handle)
+ *   BLKIF_BE_STATUS_VBD_NOT_FOUND       - the device is not attached
+ *   BLKIF_BE_STATUS_OKAY                - device detached, VDI released
+ */
+void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
+{
+    blkif_t *blkif;
+    vdi_t *vdi, **vdip;
+    blkif_vdev_t vdevice = destroy->vdevice;
+
+    blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
+    if ( blkif == NULL )
+    {
+        DPRINTF("vbd_destroy attempted for non-existent blkif (%u,%u)\n",
+                destroy->domid, destroy->blkif_handle);
+        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
+    while ((*vdip != NULL) && ((*vdip)->vdevice != vdevice))
+        vdip = &(*vdip)->next;
+
+    if (*vdip == NULL)
+    {
+        /* Previously the status was left untouched here, so the sender
+         * saw whatever value the request happened to carry. */
+        DPRINTF("vbd_destroy attempted for non-existent vbd (%u)\n",
+                (unsigned int)vdevice);
+        destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+        return;
+    }
+
+    /* Unlink before releasing: vdi_put() frees the structure. */
+    vdi = *vdip;
+    *vdip = vdi->next;
+    vdi_put(vdi);
+
+    destroy->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/*
+ * Control-channel hook: dispatch one blkif backend control message to the
+ * matching handler.
+ *
+ * Each subtype is accepted only when msg->length equals the size of the
+ * corresponding request structure.  CONNECT and DISCONNECT are accepted
+ * and ignored.  Anything else is logged as a bad message.
+ *
+ * Always returns 0; per-request results are written into the message
+ * body by the handlers.
+ */
+int parallax_control(control_msg_t *msg)
+{
+    DPRINTF("parallax_control: msg is %p\n", msg);
+
+    if (msg->type != CMSG_BLKIF_BE)
+    {
+        printf("Unexpected control message (%d)\n", msg->type);
+        return 0;
+    }
+
+    switch(msg->subtype)
+    {
+    case CMSG_BLKIF_BE_CREATE:
+        if ( msg->length != sizeof(blkif_be_create_t) )
+            goto parse_error;
+        blkif_create((blkif_be_create_t *)msg->msg);
+        break;
+
+    case CMSG_BLKIF_BE_DESTROY:
+        if ( msg->length != sizeof(blkif_be_destroy_t) )
+            goto parse_error;
+        blkif_destroy((blkif_be_destroy_t *)msg->msg);
+        break;
+
+    case CMSG_BLKIF_BE_VBD_CREATE:
+        if ( msg->length != sizeof(blkif_be_vbd_create_t) )
+            goto parse_error;
+        vbd_create((blkif_be_vbd_create_t *)msg->msg);
+        break;
+
+    case CMSG_BLKIF_BE_VBD_DESTROY:
+        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
+            goto parse_error;
+        vbd_destroy((blkif_be_vbd_destroy_t *)msg->msg);
+        break;
+
+    case CMSG_BLKIF_BE_CONNECT:
+    case CMSG_BLKIF_BE_DISCONNECT:
+        /* we don't manage the device channel, the tap does. */
+        break;
+
+    default:
+        goto parse_error;
+    }
+    return 0;
+
+parse_error:
+    printf("Bad control message!\n");
+    return 0;
+}
+
+/*
+ * Answer a BLKIF_OP_PROBE request: write one vdisk_t entry per VDI
+ * attached to 'blkif' into the request's single buffer and report the
+ * number of entries in the response status.
+ *
+ * The request must carry exactly one segment covering sectors 0..7 of its
+ * frame; otherwise the response status is BLKIF_RSP_ERROR.  At most as
+ * many entries as fit into that eight-sector buffer are written; further
+ * VDIs are dropped with a warning instead of overrunning the buffer.
+ *
+ * The response is built in place over 'req'.  Always returns
+ * BLKTAP_RESPOND.
+ */
+int parallax_probe(blkif_request_t *req, blkif_t *blkif)
+{
+    /* Capacity of the probe buffer: sectors 0..7, i.e. eight sectors. */
+    const int max_vdis = (int)((8UL << SECTOR_SHIFT) / sizeof(vdisk_t));
+    blkif_response_t *rsp;
+    vdisk_t *img_info;
+    vdi_t *vdi;
+    int i, nr_vdis = 0;
+
+    DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif);
+
+    /* We expect one buffer only. */
+    if ( req->nr_segments != 1 )
+        goto err;
+
+    /* Make sure the buffer is page-sized. */
+    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
+         (blkif_last_sect (req->frame_and_sects[0]) != 7) )
+        goto err;
+
+    /* The buffer address does not change while the list is filled.
+     * (Assumes MMAP_VADDR is a plain address computation -- confirm.) */
+    img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0);
+
+    /* fill the list of devices */
+    for (i = 0; i < VDI_HASHSZ; i++) {
+        for (vdi = blkif->vdi_hash[i]; vdi != NULL; vdi = vdi->next) {
+            if (nr_vdis == max_vdis) {
+                printf("parallax_probe: probe buffer full, "
+                       "reporting only the first %d VDIs\n", nr_vdis);
+                goto respond;
+            }
+            img_info[nr_vdis].device = vdi->vdevice;
+            img_info[nr_vdis].info = 0;
+            /* The -1 here accounts for the LSB in the radix tree */
+            img_info[nr_vdis].capacity =
+                ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE);
+            nr_vdis++;
+        }
+    }
+
+respond:
+    /* rsp overlays req, so the id is copied before anything else. */
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_PROBE;
+    rsp->status = nr_vdis; /* number of disks */
+
+    DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis);
+    return BLKTAP_RESPOND;
+
+err:
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_PROBE;
+    rsp->status = BLKIF_RSP_ERROR;
+
+    DPRINTF("parallax_probe: send error response\n");
+    return BLKTAP_RESPOND;
+}
+
+typedef struct { /* bookkeeping for one in-flight read or write request */
+ blkif_request_t *req; /* the request; the final callback overwrites it with the response */
+ int count; /* segments still outstanding; set to nr_segments, decremented by each callback */
+ int error; /* incremented by a callback whose segment failed; non-zero yields BLKIF_RSP_ERROR */
+ pthread_mutex_t mutex; /* held by the callbacks while they update count */
+} pending_t;
+
+#define MAX_REQUESTS 64
+pending_t pending_list[MAX_REQUESTS]; /* indexed by ID_TO_IDX(req->id); NOTE(review): no visible check that the index is below MAX_REQUESTS */
+
+struct cb_param { /* per-segment argument handed to read_cb()/write_cb(); freed by the callback */
+ pending_t *pent; /* pending record of the owning request */
+ int segment; /* index of the segment within the request */
+ u64 sector; /* first sector covered by the segment */
+ u64 vblock; /* for debug printing -- can be removed. */
+};
+
+/*
+ * Completion callback for one segment of a read request.
+ *
+ * r        - result of the block read; IO_BLOCK(r) is the block that was
+ *            read, or NULL on failure.  The block is released here.
+ * in_param - the struct cb_param allocated by parallax_read(); freed here.
+ *
+ * Copies the requested sectors into the segment's buffer, then updates the
+ * pending record.  The callback that brings the outstanding count to zero
+ * builds the response in place over the request and injects it.
+ */
+static void read_cb(struct io_ret r, void *in_param)
+{
+    struct cb_param *param = (struct cb_param *)in_param;
+    pending_t *p = param->pent;
+    int segment = param->segment;
+    blkif_request_t *req = p->req;
+    unsigned long size, offset, start;
+    char *dpage, *spage;
+    int failed = 0;
+
+    spage = IO_BLOCK(r);
+    if (spage == NULL) {
+        failed = 1;
+    } else {
+        /* Calculate read size and offset within the read block. */
+        offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE;
+        size   = ( blkif_last_sect (req->frame_and_sects[segment]) -
+                   blkif_first_sect(req->frame_and_sects[segment]) + 1
+                 ) << SECTOR_SHIFT;
+        start  = blkif_first_sect(req->frame_and_sects[segment])
+                 << SECTOR_SHIFT;
+
+        DPRINTF("ParallaxRead: sect: %lld (%ld,%ld), "
+                "vblock %llx, "
+                "size %lx\n",
+                param->sector,
+                blkif_first_sect(p->req->frame_and_sects[segment]),
+                blkif_last_sect (p->req->frame_and_sects[segment]),
+                param->vblock, size);
+
+        if (offset + size > BLOCK_SIZE) {
+            /* Only one block was fetched for this segment; copying would
+             * read past its end, so fail the segment instead. */
+            failed = 1;
+        } else {
+            dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment);
+            memcpy(dpage + start, spage + offset, size);
+        }
+        freeblock(spage);
+    }
+
+    /* Done the read.  Now update the pending record.  The error counter is
+     * shared between the callbacks of one request, so it is only touched
+     * while the mutex is held. */
+    pthread_mutex_lock(&p->mutex);
+    if (failed)
+        p->error++;
+    p->count--;
+
+    if (p->count == 0) {
+        blkif_response_t *rsp;
+
+        /* rsp overlays req, so the id is copied before anything else. */
+        rsp = (blkif_response_t *)req;
+        rsp->id = req->id;
+        rsp->operation = BLKIF_OP_READ;
+        if (p->error == 0) {
+            rsp->status = BLKIF_RSP_OKAY;
+        } else {
+            rsp->status = BLKIF_RSP_ERROR;
+        }
+        blktap_inject_response(rsp);
+    }
+
+    pthread_mutex_unlock(&p->mutex);
+
+    free(param); /* TODO: replace with cached alloc/dealloc */
+}
+
+/*
+ * Start a BLKIF_OP_READ request: issue one asynchronous block read per
+ * segment; read_cb() completes them and sends the response.
+ *
+ * Returns BLKTAP_STOLEN when the response will be injected later by a
+ * callback, or BLKTAP_RESPOND when 'req' has been turned into an error
+ * response in place (unknown device, no segments, or nothing could be
+ * issued).
+ */
+int parallax_read(blkif_request_t *req, blkif_t *blkif)
+{
+    blkif_response_t *rsp;
+    u64 vblock;
+    u64 sector, first_sector;
+    vdi_t *vdi;
+    int i, nr_segments;
+    pending_t *pent;
+
+    vdi = blkif_get_vdi(blkif, req->device);
+
+    if ( vdi == NULL )
+        goto err;
+
+    /* The final callback rewrites 'req' as the response, possibly before
+     * this loop has finished, so everything the loop needs from the
+     * request is copied out first. */
+    nr_segments  = req->nr_segments;
+    first_sector = req->sector_number;
+
+    /* With no segments no callback would ever run and answer. */
+    if ( nr_segments == 0 )
+        goto err;
+
+    pent = &pending_list[ID_TO_IDX(req->id)];
+    pent->count = nr_segments;
+    pent->error = 0;   /* the slot is reused; do not inherit old failures */
+    pent->req = req;
+    pthread_mutex_init(&pent->mutex, NULL);
+
+    for (i = 0; i < nr_segments; i++) {
+        struct cb_param *p;
+
+        /* Round the requested segment to a block address. */
+        sector = first_sector + (8*i);
+        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
+
+        /* TODO: Replace this call to malloc with a cached allocation */
+        p = (struct cb_param *)malloc(sizeof(struct cb_param));
+        if ( p == NULL ) {
+            int outstanding;
+
+            /* Account for this and all remaining segments as failed. */
+            pthread_mutex_lock(&pent->mutex);
+            pent->error++;
+            pent->count -= (nr_segments - i);
+            outstanding = pent->count;
+            pthread_mutex_unlock(&pent->mutex);
+
+            if ( outstanding == 0 )
+                goto err;         /* nothing in flight: answer right away */
+            return BLKTAP_STOLEN; /* a callback in flight sends the error */
+        }
+        p->pent = pent;
+        p->sector = sector;
+        p->segment = i;
+        p->vblock = vblock; /* dbg */
+
+        /* Get that block from the store. */
+        vdi_read(vdi, vblock, read_cb, (void *)p);
+    }
+
+    return BLKTAP_STOLEN;
+
+err:
+    /* rsp overlays req, so the id is copied before anything else. */
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_READ;
+    rsp->status = BLKIF_RSP_ERROR;
+
+    return BLKTAP_RESPOND;
+}
+
+/*
+ * Completion callback for one segment of a write request.
+ *
+ * r        - result of the block write; a negative IO_INT(r) is a failure.
+ * in_param - the struct cb_param allocated by parallax_write(); freed here.
+ *
+ * Updates the pending record.  The callback that brings the outstanding
+ * count to zero builds the response in place over the request and
+ * injects it.
+ */
+static void write_cb(struct io_ret r, void *in_param)
+{
+    struct cb_param *param = (struct cb_param *)in_param;
+    pending_t *p = param->pent;
+    blkif_request_t *req = p->req;
+    /* catch errors from the block code. */
+    int failed = (IO_INT(r) < 0);
+
+    /* The error counter is shared between the callbacks of one request,
+     * so it is only touched while the mutex is held. */
+    pthread_mutex_lock(&p->mutex);
+    if (failed)
+        p->error++;
+    p->count--;
+
+    if (p->count == 0) {
+        blkif_response_t *rsp;
+
+        /* rsp overlays req, so the id is copied before anything else. */
+        rsp = (blkif_response_t *)req;
+        rsp->id = req->id;
+        rsp->operation = BLKIF_OP_WRITE;
+        if (p->error == 0) {
+            rsp->status = BLKIF_RSP_OKAY;
+        } else {
+            rsp->status = BLKIF_RSP_ERROR;
+        }
+        blktap_inject_response(rsp);
+    }
+
+    pthread_mutex_unlock(&p->mutex);
+
+    free(param); /* TODO: replace with cached alloc/dealloc */
+}
+
+/*
+ * Start a BLKIF_OP_WRITE request: issue one asynchronous block write per
+ * segment; write_cb() completes them and sends the response.
+ *
+ * Only whole, block-aligned segments are supported.  All segments are
+ * validated before the first write is issued, so a request with one bad
+ * segment is rejected without writing any of its other segments.
+ *
+ * Returns BLKTAP_STOLEN when the response will be injected later by a
+ * callback, or BLKTAP_RESPOND when 'req' has been turned into an error
+ * response in place.
+ */
+int parallax_write(blkif_request_t *req, blkif_t *blkif)
+{
+    blkif_response_t *rsp;
+    u64 sector, first_sector;
+    int i, nr_segments;
+    u64 vblock;
+    char *spage;
+    unsigned long size, offset, start;
+    unsigned long idx;
+    vdi_t *vdi;
+    pending_t *pent;
+
+    vdi = blkif_get_vdi(blkif, req->device);
+
+    if ( vdi == NULL )
+        goto err;
+
+    /* The final callback rewrites 'req' as the response, possibly before
+     * the issue loop has finished, so everything that loop needs from the
+     * request is copied out first. */
+    nr_segments  = req->nr_segments;
+    first_sector = req->sector_number;
+    idx          = ID_TO_IDX(req->id);
+
+    /* With no segments no callback would ever run and answer. */
+    if ( nr_segments == 0 )
+        goto err;
+
+    /* Pass 1: validate every segment before anything is written. */
+    for (i = 0; i < nr_segments; i++) {
+        sector = first_sector + (8*i);
+
+        /* Calculate write size and offset within the block. */
+        offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE;
+        size   = ( blkif_last_sect (req->frame_and_sects[i]) -
+                   blkif_first_sect(req->frame_and_sects[i]) + 1
+                 ) << SECTOR_SHIFT;
+        start  = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
+
+        /* XXX: For now we just freak out if they try to write a   */
+        /* non block-sized, block-aligned page.                    */
+        if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) {
+            printf("]\n] STRANGE WRITE!\n]\n");
+            goto err;
+        }
+    }
+
+    pent = &pending_list[idx];
+    pent->count = nr_segments;
+    pent->error = 0;   /* the slot is reused; do not inherit old failures */
+    pent->req = req;
+    pthread_mutex_init(&pent->mutex, NULL);
+
+    /* Pass 2: issue the writes. */
+    for (i = 0; i < nr_segments; i++) {
+        struct cb_param *p;
+
+        spage = (char *)MMAP_VADDR(idx, i);
+
+        /* Round the requested segment to a block address. */
+        sector = first_sector + (8*i);
+        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
+
+        DPRINTF("ParallaxWrite: sect: %lld, vblock %llx, size %lx\n",
+                sector, vblock, (unsigned long)BLOCK_SIZE);
+
+        /* TODO: Replace this call to malloc with a cached allocation */
+        p = (struct cb_param *)malloc(sizeof(struct cb_param));
+        if ( p == NULL ) {
+            int outstanding;
+
+            /* Account for this and all remaining segments as failed. */
+            pthread_mutex_lock(&pent->mutex);
+            pent->error++;
+            pent->count -= (nr_segments - i);
+            outstanding = pent->count;
+            pthread_mutex_unlock(&pent->mutex);
+
+            if ( outstanding == 0 )
+                goto err;         /* nothing in flight: answer right away */
+            return BLKTAP_STOLEN; /* a callback in flight sends the error */
+        }
+        p->pent = pent;
+        p->sector = sector;
+        p->segment = i;
+        p->vblock = vblock; /* dbg */
+
+        /* Issue the write to the store. */
+        vdi_write(vdi, vblock, spage, write_cb, (void *)p);
+    }
+
+    return BLKTAP_STOLEN;
+
+err:
+    /* rsp overlays req, so the id is copied before anything else. */
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_WRITE;
+    rsp->status = BLKIF_RSP_ERROR;
+
+    return BLKTAP_RESPOND;
+}
+
+/*
+ * Request hook: route one block request to the probe, read or write
+ * handler of the interface owned by the requesting domain (handle 0).
+ *
+ * Returns the handler's result.  For an unknown domain or an unknown
+ * operation, 'req' is turned into a BLKIF_RSP_ERROR response in place and
+ * BLKTAP_RESPOND is returned.
+ */
+int parallax_request(blkif_request_t *req)
+{
+    blkif_response_t *rsp;
+    domid_t dom = ID_TO_DOM(req->id);
+    blkif_t *blkif = blkif_find_by_handle(dom, 0);
+    int operation;
+
+    if (blkif == NULL)
+        goto err;
+
+    if ( req->operation == BLKIF_OP_PROBE ) {
+
+        return parallax_probe(req, blkif);
+
+    } else if ( req->operation == BLKIF_OP_READ ) {
+
+        return parallax_read(req, blkif);
+
+    } else if ( req->operation == BLKIF_OP_WRITE ) {
+
+        return parallax_write(req, blkif);
+
+    } else {
+        printf("Unknown request message type!\n");
+        /* Unknown operation */
+        goto err;
+    }
+
+err:
+    /* rsp overlays req.  Writing rsp->operation first, as was done here
+     * before, may clobber req->id before it is read, so the operation is
+     * saved and the id is copied first, as in the other handlers. */
+    operation = req->operation;
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = operation;
+    rsp->status = BLKIF_RSP_ERROR;
+    return BLKTAP_RESPOND;
+}
+
+/* Reset the interface hash table so that every bucket starts out empty. */
+void __init_parallax(void)
+{
+    memset(&blkif_hash[0], 0, BLKIF_HASHSZ * sizeof(blkif_hash[0]));
+}
+
+
+
+/*
+ * Entry point of the parallax server: initialise the block store, the
+ * async block layer, the VDI registry and the local tables, register the
+ * control and request hooks, then hand over to blktap_listen().
+ *
+ * Returns 1 without starting to listen when the block store or the VDI
+ * registry cannot be initialised, 0 once blktap_listen() returns.
+ */
+int main(int argc, char *argv[])
+{
+    DPRINTF("parallax: starting.\n");
+    if (__init_blockstore() != 0) {
+        fprintf(stderr, "parallax: couldn't initialise the blockstore.\n");
+        return 1;
+    }
+    DPRINTF("parallax: initialized blockstore...\n");
+    init_block_async();
+    DPRINTF("parallax: initialized async blocks...\n");
+    if (__init_vdi() != 0) {
+        fprintf(stderr, "parallax: couldn't initialise the VDI registry.\n");
+        return 1;
+    }
+    DPRINTF("parallax: initialized vdi registry etc...\n");
+    __init_parallax();
+    DPRINTF("parallax: initialized local stuff..\n");
+
+    blktap_register_ctrl_hook("parallax_control", parallax_control);
+    blktap_register_request_hook("parallax_request", parallax_request);
+    DPRINTF("parallax: added ctrl + request hooks, starting listen...\n");
+    blktap_listen();
+
+    return 0;
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/vdi.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/vdi.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,367 @@
+/**************************************************************************
+ *
+ * vdi.c
+ *
+ * Virtual Disk Image (VDI) Interfaces
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include "blockstore.h"
+#include "block-async.h"
+#include "requests-async.h"
+#include "radix.h"
+#include "vdi.h"
+
+#define VDI_REG_BLOCK 2LL
+#define VDI_RADIX_ROOT writable(3)
+
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* I haven't decided about this registry stuff, so this is just a really
+ * quick lash-up so that there is some way to track VDIs.
+ *
+ * (Most vdi access should be with a direct handle to the block, so this
+ * registry is just for start-of-day lookup and other control operations.)
+ */
+
+/*
+ * Create an empty VDI registry.
+ *
+ * The freshly obtained block is first written out as the VDI radix root,
+ * then stamped with the registry magic and a zero VDI count and written
+ * to VDI_REG_BLOCK.  Returns the in-memory registry block (to be released
+ * with freeblock()), or NULL when no block could be obtained.
+ */
+vdi_registry_t *create_vdi_registry(void)
+{
+    vdi_registry_t *registry;
+
+    registry = (vdi_registry_t *)newblock();
+    if (registry != NULL)
+    {
+        /* zero-fill the vdi radix root while we have an empty block. */
+        writeblock(VDI_RADIX_ROOT, (void *)registry);
+
+        DPRINTF("[vdi.c] Creating VDI registry!\n");
+        registry->nr_vdis = 0;
+        registry->magic = VDI_REG_MAGIC;
+
+        writeblock(VDI_REG_BLOCK, (void *)registry);
+    }
+
+    return registry;
+}
+
+/*
+ * Load the VDI registry from VDI_REG_BLOCK, creating it when the block
+ * cannot be read.
+ *
+ * Returns the registry block (to be released with freeblock()), or NULL
+ * when it can neither be read nor created, or when its magic is wrong.
+ */
+vdi_registry_t *get_vdi_registry(void)
+{
+    vdi_registry_t *vdi_reg = (vdi_registry_t *)readblock(VDI_REG_BLOCK);
+
+    if ( vdi_reg == NULL ) {
+        vdi_reg = create_vdi_registry();
+        /* Creation can fail too; do not dereference NULL below. */
+        if ( vdi_reg == NULL )
+            return NULL;
+    }
+
+    if ( vdi_reg->magic != VDI_REG_MAGIC ) {
+        freeblock(vdi_reg);
+        return NULL;
+    }
+
+    return vdi_reg;
+}
+
+
+/*
+ * Create a new VDI and add it to the registry.
+ *
+ * parent_snap - snapshot to fork from.  When snap_get_id() resolves it,
+ *               the new VDI starts from a snapshot of that radix root;
+ *               otherwise it starts from a freshly allocated root.
+ * name        - human readable name; truncated to VDI_NAME_SZ - 1 bytes.
+ *
+ * Returns the in-memory VDI (release it with vdi_put()), or NULL on
+ * failure.  The radix lock is allocated before the registry is touched,
+ * so an allocation failure cannot leave a registered VDI behind.
+ */
+vdi_t *vdi_create(snap_id_t *parent_snap, char *name)
+{
+    int ret;
+    vdi_t *vdi;
+    vdi_registry_t *vdi_reg;
+    snap_rec_t snap_rec;
+    struct radix_lock *lock;
+
+    /* create a vdi struct */
+    vdi = newblock();
+    if (vdi == NULL)
+        return NULL;
+
+    lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
+    if (lock == NULL)
+    {
+        perror("couldn't malloc radix_lock for new vdi!");
+        freeblock(vdi);
+        return NULL;
+    }
+
+    if ( snap_get_id(parent_snap, &snap_rec) == 0 ) {
+        vdi->radix_root = snapshot(snap_rec.radix_root);
+    } else {
+        vdi->radix_root = allocblock((void *)vdi); /* vdi is just zeros here */
+        vdi->radix_root = writable(vdi->radix_root); /* grr. */
+    }
+
+    /* create a snapshot log, and add it to the vdi struct */
+
+    ret = snap_block_create(parent_snap, &vdi->snap);
+    if ( ret != 0 ) {
+        DPRINTF("Error getting snap block in vdi_create.\n");
+        free(lock);
+        freeblock(vdi);
+        return NULL;
+    }
+
+    /* append the vdi to the registry, fill block and id.             */
+    /* implicit allocation means we have to write the vdi twice here. */
+    vdi_reg = get_vdi_registry();
+    if ( vdi_reg == NULL ) {
+        free(lock);
+        freeblock(vdi);
+        return NULL;
+    }
+
+    vdi->block = allocblock((void *)vdi);
+    vdi->id = vdi_reg->nr_vdis++;
+    /* name[] holds VDI_NAME_SZ bytes, so the terminator goes into the last
+     * element; index VDI_NAME_SZ would be one past the end of the array. */
+    strncpy(vdi->name, name, VDI_NAME_SZ - 1);
+    vdi->name[VDI_NAME_SZ - 1] = '\0';
+    vdi->radix_lock = NULL; /* for tidiness: no pointer in the stored copy */
+    writeblock(vdi->block, (void *)vdi);
+
+    update(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi->id, vdi->block);
+    writeblock(VDI_REG_BLOCK, (void *)vdi_reg);
+    freeblock(vdi_reg);
+
+    vdi->radix_lock = lock;
+    radix_lock_init(vdi->radix_lock);
+
+    return vdi;
+}
+
+/* vdi_get and vdi_put currently act more like alloc/free -- they don't
+ * do refcount-based allocation.
+ */
+/*
+ * Load the VDI with registry id 'vdi_id' and give it a fresh radix lock.
+ *
+ * Returns the in-memory VDI (release it with vdi_put()), or NULL when the
+ * id is not registered, its block cannot be read, or the lock cannot be
+ * allocated.
+ */
+vdi_t *vdi_get(u64 vdi_id)
+{
+    u64 vdi_blk;
+    vdi_t *vdi;
+
+    vdi_blk = lookup(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi_id);
+
+    if ( vdi_blk == 0 )
+        return NULL;
+
+    vdi = (vdi_t *)readblock(vdi_blk);
+    /* readblock() can fail; do not dereference NULL below. */
+    if ( vdi == NULL )
+        return NULL;
+
+    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
+    if (vdi->radix_lock == NULL)
+    {
+        perror("couldn't malloc radix_lock for new vdi!");
+        freeblock(vdi);
+        return NULL;
+    }
+    radix_lock_init(vdi->radix_lock);
+
+    return vdi;
+}
+
+/*
+ * Release a VDI obtained from vdi_get() or vdi_create(): free its radix
+ * lock, then the block-sized VDI structure itself.
+ */
+void vdi_put(vdi_t *vdi)
+{
+    struct radix_lock *lock = vdi->radix_lock;
+
+    free(lock);
+    freeblock(vdi);
+}
+
+/*
+ * Take a snapshot of 'vdi': record its current radix root, with a
+ * timestamp, in the snapshot log, switch the VDI to a new root obtained
+ * from snapshot(), and write the updated VDI back to its block.
+ *
+ * When snap_append() fails, a message is printed and the VDI block is
+ * not written.
+ */
+void vdi_snapshot(vdi_t *vdi)
+{
+    snap_rec_t rec;
+
+    /* Describe the state that is being frozen. */
+    rec.deleted = 0;
+    rec.radix_root = vdi->radix_root;
+    gettimeofday(&rec.timestamp, NULL);
+
+    vdi->radix_root = snapshot(vdi->radix_root);
+
+    if ( snap_append(&vdi->snap, &rec, &vdi->snap) != 0 ) {
+        printf("snap_append returned failure\n");
+        return;
+    }
+
+    writeblock(vdi->block, vdi);
+}
+
+/*
+ * Initialise the VDI layer: set up the radix cache and make sure a VDI
+ * registry exists.  Returns 0 on success, -1 when the registry can be
+ * neither loaded nor created.
+ */
+int __init_vdi()
+{
+    vdi_registry_t *registry;
+
+    /* sneak this in here for the moment. */
+    __rcache_init();
+
+    /* force the registry to be created if it doesn't exist. */
+    registry = get_vdi_registry();
+    if (registry != NULL) {
+        freeblock(registry);
+        return 0;
+    }
+
+    printf("[vdi.c] Couldn't get/create a VDI registry!\n");
+    return -1;
+}
+
+#ifdef VDI_STANDALONE
+
+#define TEST_VDIS 50
+#define NR_ITERS 50000
+#define FORK_POINTS 200
+#define INIT_VDIS 3
+#define INIT_SNAPS 40
+
+/* These must be of decreasing size: */
+#define NEW_FORK (RAND_MAX-(RAND_MAX/1000))
+#define NEW_ROOT_VDI (RAND_MAX-((RAND_MAX/1000)*2))
+#define NEW_FORK_VDI (RAND_MAX-((RAND_MAX/1000)*3))
+
+#define GRAPH_DOT_FILE "vdi.dot"
+#define GRAPH_PS_FILE "vdi.ps"
+
+
+typedef struct sh_st {
+ snap_id_t id;
+ struct sh_st *next;
+} sh_t;
+
+#define SNAP_HASHSZ 1024
+sh_t *node_hash[SNAP_HASHSZ];
+#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
+
+#define SNAPID_EQUAL(_a,_b) \
+ (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
+/*
+ * Record snapshot id 'id' in node_hash.
+ *
+ * Returns 1 when the id was already present, 0 when it was not.  A new id
+ * is appended to the end of its bucket chain; when the node cannot be
+ * allocated, an error is printed and the id is simply not recorded.
+ */
+int sh_check_and_add(snap_id_t *id)
+{
+    sh_t **s = &node_hash[SNAP_HASH(id)];
+
+    while (*s != NULL) {
+        if (SNAPID_EQUAL(&((*s)->id), id))
+            return 1;
+        /* Advance the cursor.  Assigning through it ("*s = (*s)->next")
+         * would overwrite the bucket head and drop every node visited. */
+        s = &(*s)->next;
+    }
+
+    *s = (sh_t *)malloc(sizeof(sh_t));
+    if (*s == NULL) {
+        perror("couldn't malloc snapshot hash node");
+        return 0;
+    }
+    (*s)->id = *id;
+    (*s)->next = NULL;
+
+    return 0;
+}
+
+/*
+ * Stand-alone VDI test driver (built with VDI_STANDALONE).
+ *
+ * Creates INIT_VDIS seed VDIs, runs NR_ITERS random operations (new fork
+ * point, new root VDI, new forked VDI, or snapshot), writes the resulting
+ * snapshot graph to GRAPH_DOT_FILE and runs "dot" to render it to
+ * GRAPH_PS_FILE.
+ *
+ * Returns 0 on success, 1 when initialisation, a seed VDI, or the dot
+ * file cannot be set up.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi_list[TEST_VDIS];
+    vdi_t *vdi;
+    snap_id_t id, fork_points[FORK_POINTS];
+    int nr_vdis = 0, nr_forks = 0;
+    int i, j, r;
+    FILE *f;
+    char name[VDI_NAME_SZ];
+
+    if (__init_blockstore() != 0) {
+        printf("[o] Couldn't initialise the blockstore.\n");
+        return 1;
+    }
+    if (__init_vdi() != 0) {
+        printf("[o] Couldn't initialise the VDI registry.\n");
+        return 1;
+    }
+
+    printf("[o] Generating seed VDIs. (%d VDIs)\n", INIT_VDIS);
+
+    for (i=0; i<INIT_VDIS; i++) {
+        r=rand();
+
+        sprintf(name, "VDI Number %d", nr_vdis);
+        vdi = vdi_create(NULL, name);
+        if ( vdi == NULL ) {
+            printf("[o] Couldn't create seed VDI %d.\n", i);
+            return 1;
+        }
+        vdi_list[i] = vdi;
+        for (j=0; j<(r%INIT_SNAPS); j++)
+            vdi_snapshot(vdi);
+        fork_points[i] = vdi->snap;
+        nr_vdis++;
+        nr_forks++;
+    }
+
+    printf("[o] Running a random workload. (%d iterations)\n", NR_ITERS);
+
+    for (i=0; i<NR_ITERS; i++) {
+        r = rand();
+
+        if ( r > NEW_FORK ) {
+            /* fork_points[] has FORK_POINTS slots: stop once it is full
+             * (">" let one entry be written past the end). */
+            if ( nr_forks >= FORK_POINTS )
+                continue;
+            id = vdi_list[r%nr_vdis]->snap;
+            if ( ( id.block == 0 ) || ( id.index == 0 ) )
+                continue;
+            id.index--;
+            fork_points[nr_forks++] = id;
+
+        } else if ( r > NEW_ROOT_VDI ) {
+
+            if ( nr_vdis == TEST_VDIS )
+                continue;
+
+            sprintf(name, "VDI Number %d.", nr_vdis);
+            vdi = vdi_create(NULL, name);
+            if ( vdi == NULL )
+                continue;   /* keep NULL out of vdi_list[] */
+            vdi_list[nr_vdis++] = vdi;
+
+        } else if ( r > NEW_FORK_VDI ) {
+
+            if ( nr_vdis == TEST_VDIS )
+                continue;
+
+            sprintf(name, "VDI Number %d.", nr_vdis);
+            vdi = vdi_create(&fork_points[r%nr_forks], name);
+            if ( vdi == NULL )
+                continue;   /* keep NULL out of vdi_list[] */
+            vdi_list[nr_vdis++] = vdi;
+
+        } else /* SNAPSHOT */ {
+
+            vdi_snapshot(vdi_list[r%nr_vdis]);
+
+        }
+    }
+
+    /* now dump it out to a dot file. */
+    printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis);
+
+    f = fopen(GRAPH_DOT_FILE, "w");
+    if ( f == NULL ) {
+        perror(GRAPH_DOT_FILE);
+        return 1;
+    }
+
+    /* write graph preamble */
+    fprintf(f, "digraph G {\n");
+    fprintf(f, " rankdir=LR\n");
+
+    for (i=0; i<nr_vdis; i++) {
+        char oldnode[255];
+        snap_block_t *blk;
+        snap_id_t cur = vdi_list[i]->snap;
+        int nr_snaps, done=0;
+
+        /* add a node for the id */
+        printf("vdi: %d\n", i);
+        fprintf(f, " n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n",
+                cur.block, cur.index, vdi_list[i]->name,
+                cur.block, cur.index);
+        sprintf(oldnode, "n%Ld%d", cur.block, cur.index);
+
+        /* Follow the chain of fork points back towards the root. */
+        while (cur.block != 0) {
+            blk = snap_get_block(cur.block);
+            if ( blk == NULL )
+                break;   /* unreadable snapshot block: stop this chain */
+            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - cur.index);
+            cur = blk->hdr.fork_block;
+
+            done = sh_check_and_add(&cur);
+
+            /* add a node for the fork_id */
+            if (!done) {
+                fprintf(f, " n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n",
+                        cur.block, cur.index,
+                        cur.block, cur.index);
+            }
+
+            /* add an edge between them */
+            fprintf(f, " n%Ld%d -> %s [label=\"%u snapshots\"]\n",
+                    cur.block, cur.index, oldnode, nr_snaps);
+            sprintf(oldnode, "n%Ld%d", cur.block, cur.index);
+            freeblock(blk);
+
+            /* An already known fork point has had its ancestry drawn. */
+            if (done) break;
+        }
+    }
+
+    /* write graph postamble */
+    fprintf(f, "}\n");
+    fclose(f);
+
+    printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE);
+    {
+        char cmd[255];
+        sprintf(cmd, "dot %s -Tps -o %s", GRAPH_DOT_FILE, GRAPH_PS_FILE);
+        system(cmd);
+    }
+    return 0;
+}
+
+#endif
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/vdi.h
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/vdi.h Sun Jul 3 14:14:09 2005
@@ -0,0 +1,55 @@
+#ifndef _VDI_H_
+#define _VDI_H_
+/**************************************************************************
+ *
+ * vdi.h
+ *
+ * Virtual Disk Image (VDI) Interfaces
+ *
+ */
+
+#ifndef __VDI_H__
+#define __VDI_H__
+
+#include "blktaplib.h"
+#include "snaplog.h"
+
+#define VDI_HEIGHT 27 /* Note that these are now hard-coded */
+#define VDI_REG_HEIGHT 27 /* in the async lookup code */
+
+#define VDI_NAME_SZ 256 /* size in bytes of vdi_t.name[] */
+
+
+typedef struct vdi { /* one virtual disk image; vdi.c allocates it as a whole block and writes it to 'block' */
+ u64 id; /* unique vdi id -- used by the registry; taken from the registry's nr_vdis counter by vdi_create() */
+ u64 block; /* block where this vdi lives (also unique)*/
+ u64 radix_root; /* radix root node for block mappings; replaced by vdi_snapshot() */
+ snap_id_t snap; /* next snapshot slot for this VDI */
+ struct vdi *next; /* used to hash-chain in blkif. */
+ blkif_vdev_t vdevice; /* currently mounted as... */
+ struct radix_lock *radix_lock;/* per-line L1 RW lock for parallel reqs; malloc'ed by vdi_get()/vdi_create(), freed by vdi_put() */
+ char name[VDI_NAME_SZ];/* human readable vdi name */
+} vdi_t;
+
+#define VDI_REG_MAGIC 0xff00ff0bb0ff00ffLL /* expected value of vdi_registry_t.magic */
+
+typedef struct vdi_registry { /* registry header, checked by get_vdi_registry() */
+ u64 magic; /* VDI_REG_MAGIC; any other value makes get_vdi_registry() return NULL */
+ u64 nr_vdis; /* incremented by vdi_create(); the old value becomes the new VDI's id */
+} vdi_registry_t;
+
+
+int __init_vdi(void);
+
+vdi_t *vdi_get(u64 vdi_id);
+void vdi_put(vdi_t *vdi);
+vdi_registry_t *get_vdi_registry(void);
+vdi_t *vdi_create(snap_id_t *parent_snap, char *name);
+u64 vdi_lookup_block(vdi_t *vdi, u64 vdi_block, int *writable);
+void vdi_update_block(vdi_t *vdi, u64 vdi_block, u64 g_block);
+void vdi_snapshot(vdi_t *vdi);
+
+
+#endif /* __VDI_H__ */
+
+#endif //_VDI_H_
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/requests-async.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/requests-async.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,762 @@
+/* requests-async.c
+ *
+ * asynchronous request dispatcher for radix access in parallax.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <pthread.h>
+#include <err.h>
+#include <zlib.h> /* for crc32() */
+#include "requests-async.h"
+#include "vdi.h"
+#include "radix.h"
+
+#define L1_IDX(_a) (((_a) & 0x0000000007fc0000ULL) >> 18)
+#define L2_IDX(_a) (((_a) & 0x000000000003fe00ULL) >> 9)
+#define L3_IDX(_a) (((_a) & 0x00000000000001ffULL))
+
+
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+struct block_info {
+ u32 crc;
+ u32 unused;
+};
+
+struct io_req {
+ enum { IO_OP_READ, IO_OP_WRITE } op; /* set by vdi_read() / vdi_write() */
+ u64 root; /* copy of vdi->radix_root taken at submission */
+ u64 vaddr; /* block address, already shifted left by one */
+ int state; /* current step, one of enum states */
+ io_cb_t cb; /* caller's completion callback */
+ void *param; /* opaque argument passed back to cb */
+ struct radix_lock *lock; /* vdi->radix_lock; row L1_IDX(vaddr) is held while the request runs */
+
+ /* internal stuff: */
+ struct io_ret retval;/* holds the return while we unlock. */
+ char *block; /* the block to write */
+ radix_tree_node radix[3]; /* L1/L2/L3 nodes kept by the write path; freed when it finishes */
+ u64 radix_addr[3]; /* block ids the write path writes those nodes back to */
+ struct block_info bi; /* crc record: computed in vdi_write(), loaded from L3 in read_cb() */
+};
+
+/* AND every entry of node with ONEMASK (defined outside this file). */
+void clear_w_bits(radix_tree_node node)
+{
+    int slot = RADIX_TREE_MAP_ENTRIES;
+    while (slot-- > 0)
+        node[slot] &= ONEMASK;
+}
+
+/* As clear_w_bits(), but only for even entries: each odd entry of an L3 */
+/* node is used as the block_info record of the entry before it. */
+void clear_L3_w_bits(radix_tree_node node)
+{
+    int slot = 0;
+    for (; slot < RADIX_TREE_MAP_ENTRIES; slot += 2) node[slot] &= ONEMASK;
+}
+
+enum states {
+ /* both */
+ READ_L1,
+ READ_L2,
+ READ_L3,
+
+ /* read */
+ READ_LOCKED,
+ READ_DATA,
+ READ_UNLOCKED,
+ RETURN_ZERO,
+
+ /* write */
+ WRITE_LOCKED,
+ WRITE_DATA,
+ WRITE_L3,
+ WRITE_UNLOCKED,
+
+ /* L3 Zero Path */
+ ALLOC_DATA_L3z,
+ WRITE_L3_L3z,
+
+ /* L3 Fault Path */
+ ALLOC_DATA_L3f,
+ WRITE_L3_L3f,
+
+ /* L2 Zero Path */
+ ALLOC_DATA_L2z,
+ WRITE_L2_L2z,
+ ALLOC_L3_L2z,
+ WRITE_L2_L3z,
+
+ /* L2 Fault Path */
+ READ_L3_L2f,
+ ALLOC_DATA_L2f,
+ WRITE_L2_L2f,
+ ALLOC_L3_L2f,
+ WRITE_L2_L3f,
+
+ /* L1 Zero Path */
+ ALLOC_DATA_L1z,
+ ALLOC_L3_L1z,
+ ALLOC_L2_L1z,
+ WRITE_L1_L1z,
+
+ /* L1 Fault Path */
+ READ_L2_L1f,
+ READ_L3_L1f,
+ ALLOC_DATA_L1f,
+ ALLOC_L3_L1f,
+ ALLOC_L2_L1f,
+ WRITE_L1_L1f,
+
+};
+
+enum radix_offsets {
+ L1 = 0,
+ L2 = 1,
+ L3 = 2
+};
+
+
+static void read_cb(struct io_ret ret, void *param);
+static void write_cb(struct io_ret ret, void *param);
+
+/* Start an asynchronous read of virtual block vaddr on vdi.  Returns 0
+ * once the request is queued (cb(result, param) is invoked later), or
+ * ERR_BAD_VADDR / ERR_NOMEM if nothing was queued. */
+int vdi_read(vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param)
+{
+    struct io_req *rq;
+    if (!VALID_VADDR(vaddr))
+        return ERR_BAD_VADDR;
+    rq = (struct io_req *)malloc(sizeof(*rq));
+    if (rq == NULL)
+        return ERR_NOMEM;
+
+    /* Every second entry of a bottom-level radix node stores crc32 */
+    /* values etc., so the address is doubled to skip those entries. */
+    rq->vaddr = vaddr << 1;
+    rq->op = IO_OP_READ;
+    rq->state = READ_LOCKED;
+    rq->root = vdi->radix_root;
+    rq->lock = vdi->radix_lock;
+    rq->cb = cb;
+    rq->param = param;
+    rq->radix[L1] = rq->radix[L2] = rq->radix[L3] = NULL;
+    block_rlock(rq->lock, L1_IDX(rq->vaddr), read_cb, rq);
+    return 0;
+}
+
+
+/* Start an asynchronous write of the BLOCK_SIZE buffer block to virtual
+ * block vaddr on vdi.  Only the pointer is stored, so block must stay valid
+ * until cb runs.  Returns 0 once queued, else ERR_BAD_VADDR / ERR_NOMEM. */
+int vdi_write(vdi_t *vdi, u64 vaddr, char *block,
+              io_cb_t cb, void *param)
+{
+    struct io_req *rq;
+    u32 sum;
+    if (!VALID_VADDR(vaddr))
+        return ERR_BAD_VADDR;
+    rq = (struct io_req *)malloc(sizeof(*rq));
+    if (rq == NULL)
+        return ERR_NOMEM;
+    /* Todo: add a pseudoheader to the block to include some location */
+    /* information in the CRC as well. */
+    sum = (u32) crc32(0L, Z_NULL, 0);
+    sum = (u32) crc32(sum, block, BLOCK_SIZE);
+    rq->bi.crc = sum;
+    rq->bi.unused = 0xdeadbeef;
+
+    /* Every second entry of a bottom-level radix node stores crc32 */
+    /* values etc., so the address is doubled to skip those entries. */
+    rq->vaddr = vaddr << 1;
+    rq->op = IO_OP_WRITE;
+    rq->state = WRITE_LOCKED;
+    rq->block = block;
+    rq->root = vdi->radix_root;
+    rq->lock = vdi->radix_lock;
+    rq->radix_addr[L1] = getid(rq->root); /* for consistency */
+    rq->cb = cb;
+    rq->param = param;
+    rq->radix[L1] = rq->radix[L2] = rq->radix[L3] = NULL;
+    block_wlock(rq->lock, L1_IDX(rq->vaddr), write_cb, rq);
+    return 0;
+}
+
+/* Read-path state machine: each I/O or lock completion re-enters here and
+ * req->state selects the next step (lock, L1, L2, L3, data, unlock).  A
+ * ZERO pointer at any level ends in RETURN_ZERO with a newblock() result. */
+static void read_cb(struct io_ret ret, void *param)
+{
+ struct io_req *req = (struct io_req *)param;
+ radix_tree_node node;
+ u64 idx;
+ char *block;
+ void *req_param;
+ DPRINTF("read_cb\n");
+ switch(req->state) {
+ case READ_LOCKED:
+
+ DPRINTF("READ_LOCKED\n");
+ req->state = READ_L1;
+ block_read(getid(req->root), read_cb, req);
+ break;
+
+ case READ_L1: /* block is the radix root */
+
+ DPRINTF("READ_L1\n");
+ block = IO_BLOCK(ret);
+ if (block == NULL) goto fail;
+ node = (radix_tree_node) block;
+ idx = getid( node[L1_IDX(req->vaddr)] );
+ free(block);
+ if ( idx == ZERO ) {
+ req->state = RETURN_ZERO;
+ block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+ } else {
+ req->state = READ_L2;
+ block_read(idx, read_cb, req);
+ }
+ break;
+
+ case READ_L2:
+
+ DPRINTF("READ_L2\n");
+ block = IO_BLOCK(ret);
+ if (block == NULL) goto fail;
+ node = (radix_tree_node) block;
+ idx = getid( node[L2_IDX(req->vaddr)] );
+ free(block);
+ if ( idx == ZERO ) {
+ req->state = RETURN_ZERO;
+ block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+ } else {
+ req->state = READ_L3;
+ block_read(idx, read_cb, req);
+ }
+ break;
+
+ case READ_L3:
+ {
+ struct block_info *bi;
+
+ DPRINTF("READ_L3\n");
+ block = IO_BLOCK(ret);
+ if (block == NULL) goto fail;
+ node = (radix_tree_node) block;
+ idx = getid( node[L3_IDX(req->vaddr)] );
+ bi = (struct block_info *) &node[L3_IDX(req->vaddr) + 1];
+ req->bi = *bi;
+ free(block);
+ if ( idx == ZERO ) {
+ req->state = RETURN_ZERO;
+ block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+ } else {
+ req->state = READ_DATA;
+ block_read(idx, read_cb, req);
+ }
+ break;
+ }
+ case READ_DATA:
+ {
+ u32 crc;
+
+ DPRINTF("READ_DATA\n");
+ block = IO_BLOCK(ret);
+ if (block == NULL) goto fail;
+
+ /* crc check */
+ crc = (u32) crc32(0L, Z_NULL, 0);
+ crc = (u32) crc32(crc, block, BLOCK_SIZE);
+ if (crc != req->bi.crc) {
+ /* TODO: add a retry loop here. */
+ /* Do this after the cache is added -- make sure to */
+ /* invalidate the bad page before reissuing the read. */
+ warnx("Bad CRC on vaddr (%llu:%u)", /* warnx: errno is unrelated here */
+ (unsigned long long)req->vaddr, (unsigned int)req->bi.unused);
+#ifdef PRINT_BADCRC_PAGES
+ {
+ int j;
+ for (j=0; j<BLOCK_SIZE; j++) {
+ if (isprint((unsigned char)block[j])) {
+ printf("%c", block[j]);
+ } else {
+ printf(".");
+ }
+ if ((j % 64) == 0) printf("\n");
+ }
+ }
+#endif /* PRINT_BADCRC_PAGES */
+
+ /* fast and loose for the moment. */
+ /* goto fail; */
+ }
+
+ req->retval = ret;
+ req->state = READ_UNLOCKED;
+ block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+ break;
+ }
+ case READ_UNLOCKED:
+ {
+ struct io_ret r;
+ io_cb_t cb;
+ DPRINTF("READ_UNLOCKED\n");
+ req_param = req->param;
+ r = req->retval;
+ cb = req->cb;
+ free(req);
+ cb(r, req_param);
+ break;
+ }
+
+ case RETURN_ZERO:
+ {
+ struct io_ret r;
+ io_cb_t cb;
+ DPRINTF("RETURN_ZERO\n");
+ req_param = req->param;
+ cb = req->cb;
+ free(req);
+ r.type = IO_BLOCK_T;
+ r.u.b = newblock();
+ cb(r, req_param);
+ break;
+ }
+
+ default:
+ DPRINTF("*** Read: Bad state! (%d) ***\n", req->state);
+ goto fail;
+ }
+
+ return;
+
+ fail:
+ {
+ /* The I/O states that jump here still hold the row read lock taken
+ * in vdi_read(), so release it instead of returning directly.  The
+ * unlock completion re-enters read_cb in READ_UNLOCKED, which frees
+ * req and passes req->retval (the failed result) to the caller. */
+ DPRINTF("asyn_read had a read error.\n");
+ req->retval = ret;
+ req->state = READ_UNLOCKED;
+ block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+ }
+
+
+}
+
+/* Write state machine, run under the L1-row write lock.  Walks L1 -> L2 ->
+ * L3: a ZERO entry takes a "z" path (allocate the missing levels), a
+ * non-writable entry an "f" path (allocate fresh copies), else write in place. */
+static void write_cb(struct io_ret r, void *param)
+{
+ struct io_req *req = (struct io_req *)param;
+ radix_tree_node node;
+ u64 a, addr;
+ void *req_param;
+ struct block_info *bi;
+ switch(req->state) {
+ case WRITE_LOCKED:
+ DPRINTF("WRITE_LOCKED (%llu)\n", L1_IDX(req->vaddr));
+ req->state = READ_L1;
+ block_read(getid(req->root), write_cb, req);
+ break;
+
+ case READ_L1: /* block is the radix root */
+
+ DPRINTF("READ_L1\n");
+ node = (radix_tree_node) IO_BLOCK(r);
+ if (node == NULL) goto fail;
+ a = node[L1_IDX(req->vaddr)];
+ addr = getid(a);
+
+ req->radix_addr[L2] = addr;
+ req->radix[L1] = node;
+
+ if ( addr == ZERO ) {
+ /* L1 empty subtree: */
+ req->state = ALLOC_DATA_L1z;
+ block_alloc( req->block, write_cb, req );
+ } else if ( !iswritable(a) ) {
+ /* L1 fault: */
+ req->state = READ_L2_L1f;
+ block_read( addr, write_cb, req );
+ } else {
+ req->state = READ_L2;
+ block_read( addr, write_cb, req );
+ }
+ break;
+
+ case READ_L2:
+
+ DPRINTF("READ_L2\n");
+ node = (radix_tree_node) IO_BLOCK(r);
+ if (node == NULL) goto fail;
+ a = node[L2_IDX(req->vaddr)];
+ addr = getid(a);
+
+ req->radix_addr[L3] = addr;
+ req->radix[L2] = node;
+
+ if ( addr == ZERO ) {
+ /* L2 empty subtree: */
+ req->state = ALLOC_DATA_L2z;
+ block_alloc( req->block, write_cb, req );
+ } else if ( !iswritable(a) ) {
+ /* L2 fault: */
+ req->state = READ_L3_L2f;
+ block_read( addr, write_cb, req );
+ } else {
+ req->state = READ_L3;
+ block_read( addr, write_cb, req );
+ }
+ break;
+
+ case READ_L3:
+
+ DPRINTF("READ_L3\n");
+ node = (radix_tree_node) IO_BLOCK(r);
+ if (node == NULL) goto fail;
+ a = node[L3_IDX(req->vaddr)];
+ addr = getid(a);
+
+ req->radix[L3] = node;
+
+ if ( addr == ZERO ) {
+ /* L3 empty entry: */
+ req->state = ALLOC_DATA_L3z;
+ block_alloc( req->block, write_cb, req );
+ } else if ( !iswritable(a) ) {
+ /* L3 fault: */
+ req->state = ALLOC_DATA_L3f;
+ block_alloc( req->block, write_cb, req );
+ } else {
+ req->state = WRITE_DATA;
+ block_write( addr, req->block, write_cb, req );
+ }
+ break;
+
+ case WRITE_DATA:
+
+ DPRINTF("WRITE_DATA\n");
+ /* The L3 radix points to the correct block, we just need to */
+ /* update the crc. */
+ if (IO_INT(r) < 0) goto fail;
+ bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+ req->bi.unused = 101;
+ *bi = req->bi;
+ req->state = WRITE_L3;
+ block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
+ break;
+
+ /* L3 Zero Path: */
+
+ case ALLOC_DATA_L3z:
+
+ DPRINTF("ALLOC_DATA_L3z\n");
+ addr = IO_ADDR(r);
+ a = writable(addr);
+ req->radix[L3][L3_IDX(req->vaddr)] = a;
+ bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+ req->bi.unused = 102;
+ *bi = req->bi;
+ req->state = WRITE_L3_L3z;
+ block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
+ break;
+
+ /* L3 Fault Path: */
+
+ case ALLOC_DATA_L3f:
+
+ DPRINTF("ALLOC_DATA_L3f\n");
+ addr = IO_ADDR(r);
+ a = writable(addr);
+ req->radix[L3][L3_IDX(req->vaddr)] = a;
+ bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+ req->bi.unused = 103;
+ *bi = req->bi;
+ req->state = WRITE_L3_L3f;
+ block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
+ break;
+
+ /* L2 Zero Path: */
+
+ case ALLOC_DATA_L2z:
+
+ DPRINTF("ALLOC_DATA_L2z\n");
+ addr = IO_ADDR(r);
+ a = writable(addr);
+ req->radix[L3] = newblock();
+ req->radix[L3][L3_IDX(req->vaddr)] = a;
+ bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+ req->bi.unused = 104;
+ *bi = req->bi;
+ req->state = ALLOC_L3_L2z;
+ block_alloc( (char*)req->radix[L3], write_cb, req );
+ break;
+
+ case ALLOC_L3_L2z:
+
+ DPRINTF("ALLOC_L3_L2z\n");
+ addr = IO_ADDR(r);
+ a = writable(addr);
+ req->radix[L2][L2_IDX(req->vaddr)] = a;
+ req->state = WRITE_L2_L2z;
+ block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
+ break;
+
+ /* L2 Fault Path: */
+
+ case READ_L3_L2f:
+
+ DPRINTF("READ_L3_L2f\n");
+ node = (radix_tree_node) IO_BLOCK(r);
+ if (node == NULL) goto fail;
+ /* The NULL check has to come first: clear_L3_w_bits() dereferences */
+ /* node.  (a/addr are recomputed by the next state, not needed here.) */
+ clear_L3_w_bits(node);
+
+ req->radix[L3] = node;
+ req->state = ALLOC_DATA_L2f;
+ block_alloc( req->block, write_cb, req );
+ break;
+
+ case ALLOC_DATA_L2f:
+
+ DPRINTF("ALLOC_DATA_L2f\n");
+ addr = IO_ADDR(r);
+ a = writable(addr);
+ req->radix[L3][L3_IDX(req->vaddr)] = a;
+ bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+ req->bi.unused = 105;
+ *bi = req->bi;
+ req->state = ALLOC_L3_L2f;
+ block_alloc( (char*)req->radix[L3], write_cb, req );
+ break;
+
+ case ALLOC_L3_L2f:
+
+ DPRINTF("ALLOC_L3_L2f\n");
+ addr = IO_ADDR(r);
+ a = writable(addr);
+ req->radix[L2][L2_IDX(req->vaddr)] = a;
+ req->state = WRITE_L2_L2f;
+ block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
+ break;
+
+ /* L1 Zero Path: */
+
+ case ALLOC_DATA_L1z:
+
+ DPRINTF("ALLOC_DATA_L1z\n");
+ addr = IO_ADDR(r);
+ a = writable(addr);
+ req->radix[L3] = newblock();
+ req->radix[L3][L3_IDX(req->vaddr)] = a;
+ bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+ req->bi.unused = 106;
+ *bi = req->bi;
+ req->state = ALLOC_L3_L1z;
+ block_alloc( (char*)req->radix[L3], write_cb, req );
+ break;
+
+ case ALLOC_L3_L1z:
+
+ DPRINTF("ALLOC_L3_L1z\n");
+ addr = IO_ADDR(r);
+ a = writable(addr);
+ req->radix[L2] = newblock();
+ req->radix[L2][L2_IDX(req->vaddr)] = a;
+ req->state = ALLOC_L2_L1z;
+ block_alloc( (char*)req->radix[L2], write_cb, req );
+ break;
+
+ case ALLOC_L2_L1z:
+
+ DPRINTF("ALLOC_L2_L1z\n");
+ addr = IO_ADDR(r);
+ a = writable(addr);
+ req->radix[L1][L1_IDX(req->vaddr)] = a;
+ req->state = WRITE_L1_L1z;
+ block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
+ break;
+
+ /* L1 Fault Path: */
+
+ case READ_L2_L1f:
+
+ DPRINTF("READ_L2_L1f\n");
+ node = (radix_tree_node) IO_BLOCK(r);
+ if (node == NULL) goto fail;
+ clear_w_bits(node); /* only after the NULL check: it dereferences node */
+ a = node[L2_IDX(req->vaddr)];
+ addr = getid(a);
+
+ req->radix_addr[L3] = addr;
+ req->radix[L2] = node;
+
+ if (addr == ZERO) {
+ /* nothing below L2, create an empty L3 and alloc data. */
+ /* (So skip READ_L3_L1f.) */
+ req->radix[L3] = newblock();
+ req->state = ALLOC_DATA_L1f;
+ block_alloc( req->block, write_cb, req );
+ } else {
+ req->state = READ_L3_L1f;
+ block_read( addr, write_cb, req );
+ }
+ break;
+
+ case READ_L3_L1f:
+
+ DPRINTF("READ_L3_L1f\n");
+ node = (radix_tree_node) IO_BLOCK(r);
+ if (node == NULL) goto fail;
+ /* The NULL check has to come first: clear_L3_w_bits() dereferences */
+ /* node.  (a/addr are recomputed by the next state, not needed here.) */
+ clear_L3_w_bits(node);
+
+ req->radix[L3] = node;
+ req->state = ALLOC_DATA_L1f;
+ block_alloc( req->block, write_cb, req );
+ break;
+
+ case ALLOC_DATA_L1f:
+
+ DPRINTF("ALLOC_DATA_L1f\n");
+ addr = IO_ADDR(r);
+ a = writable(addr);
+ req->radix[L3][L3_IDX(req->vaddr)] = a;
+ bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+ req->bi.unused = 107;
+ *bi = req->bi;
+ req->state = ALLOC_L3_L1f;
+ block_alloc( (char*)req->radix[L3], write_cb, req );
+ break;
+
+ case ALLOC_L3_L1f:
+
+ DPRINTF("ALLOC_L3_L1f\n");
+ addr = IO_ADDR(r);
+ a = writable(addr);
+ req->radix[L2][L2_IDX(req->vaddr)] = a;
+ req->state = ALLOC_L2_L1f;
+ block_alloc( (char*)req->radix[L2], write_cb, req );
+ break;
+
+ case ALLOC_L2_L1f:
+
+ DPRINTF("ALLOC_L2_L1f\n");
+ addr = IO_ADDR(r);
+ a = writable(addr);
+ req->radix[L1][L1_IDX(req->vaddr)] = a;
+ req->state = WRITE_L1_L1f;
+ block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
+ break;
+
+ case WRITE_L3:
+ case WRITE_L3_L3z:
+ case WRITE_L3_L3f:
+ case WRITE_L2_L2z:
+ case WRITE_L2_L2f:
+ case WRITE_L1_L1z:
+ case WRITE_L1_L1f:
+ {
+ int i;
+ DPRINTF("DONE\n");
+ /* free any saved node vals. */
+ for (i=0; i<3; i++)
+ if (req->radix[i] != 0) free(req->radix[i]);
+ req->retval = r;
+ req->state = WRITE_UNLOCKED;
+ block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
+ break;
+ }
+ case WRITE_UNLOCKED:
+ {
+ struct io_ret r;
+ io_cb_t cb;
+ DPRINTF("WRITE_UNLOCKED!\n");
+ req_param = req->param;
+ r = req->retval;
+ cb = req->cb;
+ free(req);
+ cb(r, req_param);
+ break;
+ }
+
+ default:
+ DPRINTF("*** Write: Bad state! (%d) ***\n", req->state);
+ goto fail;
+ }
+
+ return;
+
+ fail:
+ {
+ int i;
+
+ DPRINTF("asyn_write had a read error mid-way.\n");
+ /* free any saved node vals. */
+ for (i=0; i<3; i++)
+ if (req->radix[i] != 0) free(req->radix[i]);
+ /* The row write lock taken in vdi_write() is still held on the
+ * paths that land here; release it and let WRITE_UNLOCKED free req
+ * and deliver the error (IO_INT_T, -1) to the caller, the same way
+ * the success path finishes. */
+ req->retval.type = IO_INT_T;
+ req->retval.u.i = -1;
+ req->state = WRITE_UNLOCKED;
+ block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
+ }
+}
+
+char *vdi_read_s(vdi_t *vdi, u64 vaddr) /* synchronous wrapper around vdi_read() */
+{
+ pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
+ char *block = NULL; /* stays NULL if the read is never queued */
+ int ret;
+ /* reads_cb is a nested function (GCC extension) so it can store into block. */
+ void reads_cb(struct io_ret r, void *param)
+ {
+ block = IO_BLOCK(r);
+ pthread_mutex_unlock((pthread_mutex_t *)param);
+ }
+ /* m doubles as a completion signal: locked here, unlocked by reads_cb. */
+ pthread_mutex_lock(&m);
+ ret = vdi_read(vdi, vaddr, reads_cb, &m);
+ /* NOTE(review): the second lock assumes re-locking m blocks until reads_cb unlocks it -- confirm for the mutex type in use. */
+ if (ret == 0) pthread_mutex_lock(&m);
+ /* Returns the block passed to reads_cb, or NULL when vdi_read() returned an error. */
+ return block;
+}
+
+
+/* Synchronous vdi_write(): blocks until the write completes and returns its
+ * integer result, or the ERR_* code from vdi_write() if it was never queued. */
+int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block)
+{
+    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
+    int ret, result = -1;
+    /* GCC nested function: stores into the enclosing frame's result. */
+    void writes_cb(struct io_ret r, void *param)
+    {
+        result = IO_INT(r);
+        pthread_mutex_unlock((pthread_mutex_t *)param);
+    }
+    pthread_mutex_lock(&m);   /* held until writes_cb releases it */
+    ret = vdi_write(vdi, vaddr, block, writes_cb, &m);
+    if (ret != 0) return ret; /* nothing queued: writes_cb will never run */
+    pthread_mutex_lock(&m);   /* wait for the completion callback */
+    return result;
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/requests-async.h
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/requests-async.h Sun Jul 3 14:14:09 2005
@@ -0,0 +1,29 @@
+#ifndef _REQUESTSASYNC_H_
+#define _REQUESTSASYNC_H_
+
+#include "block-async.h"
+#include "blockstore.h" /* for newblock etc. */
+
+/*
+#define BLOCK_SIZE 4096
+#define ZERO 0ULL
+#define getid(x) (((x)>>1)&0x7fffffffffffffffLLU)
+#define iswritable(x) (((x) & 1LLU) != 0)
+#define writable(x) (((x) << 1) | 1LLU)
+#define readonly(x) ((u64)((x) << 1))
+*/
+
+#define VADDR_MASK 0x0000000003ffffffLLU /* 26-bits = 256Gig */
+#define VALID_VADDR(x) (((x) & VADDR_MASK) == (x))
+
+int vdi_read (vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param);
+int vdi_write(vdi_t *vdi, u64 vaddr, char *block, io_cb_t cb, void *param);
+
+/* synchronous versions: */
+char *vdi_read_s (vdi_t *vdi, u64 vaddr);
+int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block);
+
+#define ERR_BAD_VADDR (-1) /* vaddr has bits set outside VADDR_MASK */
+#define ERR_NOMEM (-2) /* the request record could not be allocated */
+
+#endif //_REQUESTSASYNC_H_
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/vdi_unittest.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/vdi_unittest.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,184 @@
+/**************************************************************************
+ *
+ * vdi_unittest.c
+ *
+ * Run a small test workload to ensure that data access through a vdi
+ * is (at least superficially) correct.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "requests-async.h"
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+#define TEST_PAGES 32
+static char *zero_page;
+static char pages[TEST_PAGES][BLOCK_SIZE];
+static int next_page = 0;
+
+/* Fill every test page with pseudo-random bytes and fetch the reference
+ * block (from newblock()) that reads of unmapped blocks are compared to. */
+void fill_test_pages(void)
+{
+    int i, j;
+
+    for (i = 0; i < TEST_PAGES; i++) {
+        /* Byte-wise fill stays inside the page; BLOCK_SIZE/4 stores of a
+         * long overrun it wherever sizeof(long) is 8. */
+        for (j = 0; j < BLOCK_SIZE; j++)
+            pages[i][j] = (char) random();
+    }
+    zero_page = newblock();
+}
+
+/* Combine three radix indices into one block address, 9 bits per level
+ * (L1 highest).  The inputs are OR-ed in as given, without masking. */
+inline u64 make_vaddr(u64 L1, u64 L2, u64 L3)
+{
+    u64 packed;
+    packed = (L1 << 9) | L2;
+    packed = (packed << 9) | L3;
+    return packed;
+}
+
+/* Write the next unused test page to block (L1, L2, L3) of vdi, read it
+ * back and report any mismatch on stdout. */
+void touch_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3)
+{
+    u64 vaddr;
+    char *page;
+    char *rpage = NULL;
+
+    printf("TOUCH (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
+    if (next_page >= TEST_PAGES) {
+        printf("out of test pages (TEST_PAGES = %d)\n", TEST_PAGES);
+        return;
+    }
+    page = pages[next_page++];
+    vaddr = make_vaddr(L1, L2, L3);
+    vdi_write_s(vdi, vaddr, page);
+    rpage = vdi_read_s(vdi, vaddr);
+    if (rpage == NULL) {
+        printf( "read %Lu returned NULL\n", vaddr);
+        return;
+    }
+    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
+        printf( "read %Lu returned a different page\n", vaddr);
+    /* Release the read buffer on the mismatch path too. */
+    freeblock(rpage);
+}
+
+/* Read block (L1, L2, L3) of vdi and report on stdout if it is missing or
+ * differs from the BLOCK_SIZE bytes at page. */
+void test_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3, char *page)
+{
+    u64 vaddr;
+    char *rpage = NULL;
+
+    printf("TEST (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
+
+    vaddr = make_vaddr(L1, L2, L3);
+    rpage = vdi_read_s(vdi, vaddr);
+
+    if (rpage == NULL)
+    {
+        printf( "read %Lu returned NULL\n", vaddr);
+        return;
+    }
+
+    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
+        printf( "read %Lu returned a different page\n", vaddr);
+
+    /* Free the buffer whether or not the contents matched. */
+    freeblock(rpage);
+}
+
+/* Drive one write/read sequence through each radix path: zero paths on a
+ * fresh VDI, fault paths after vdi_snapshot(), then reads of unmapped blocks. */
+void coverage_test(vdi_t *vdi)
+{
+    int i, j, k;
+    /* Do a series of writes and reads to test all paths through the
+     * async radix code. The radix request code will dump CRC warnings
+     * if there are data problems here as well.
+     */
+
+    /* L1 Zero */
+    touch_block(vdi, 0, 0, 0);
+
+    /* L2 Zero */
+    i = next_page;
+    touch_block(vdi, 0, 1, 0);
+
+    /* L3 Zero */
+    j = next_page;
+    touch_block(vdi, 0, 0, 1);
+    k = next_page;
+    touch_block(vdi, 0, 1, 1);
+
+    /* Direct write */
+    touch_block(vdi, 0, 0, 0);
+
+    vdi_snapshot(vdi);
+
+    /* L1 fault */
+    touch_block(vdi, 0, 0, 0);
+    /* test the read-only branches that should have been copied over. */
+    test_block(vdi, 0, 1, 0, pages[i]);
+    test_block(vdi, 0, 0, 1, pages[j]);
+
+    /* L2 fault */
+    touch_block(vdi, 0, 1, 0);
+    test_block(vdi, 0, 1, 1, pages[k]);
+
+    /* L3 fault */
+    touch_block(vdi, 0, 0, 1);
+
+    /* read - L1 zero */
+    test_block(vdi, 1, 0, 0, zero_page);
+
+    /* read - L2 zero */
+    test_block(vdi, 0, 2, 0, zero_page);
+
+    /* read - L3 zero */
+    test_block(vdi, 0, 0, 2, zero_page);
+}
+
+/* Unit-test driver: initialise the blockstore, the async block layer and
+ * the VDI code, create a scratch VDI and run coverage_test() on it.
+ * Mismatches are only printed; the exit status is non-zero solely when
+ * the VDI cannot be created. */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi;
+
+    __init_blockstore();
+    init_block_async();
+    __init_vdi();
+
+    vdi = vdi_create( NULL, "UNIT TEST VDI");
+
+    if ( vdi == NULL ) {
+        /* vdi is NULL on this path, so there is nothing to hand to
+         * freeblock(). */
+        printf("Failed to create VDI!\n");
+        exit(-1);
+    }
+
+    /* Populate pages[] and zero_page first: coverage_test() consumes
+     * them through touch_block() and test_block(). */
+    fill_test_pages();
+    coverage_test(vdi);
+
+    freeblock(vdi);
+
+    return (0);
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/block-async.h
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/block-async.h Sun Jul 3 14:14:09 2005
@@ -0,0 +1,69 @@
+/* block-async.h
+ *
+ * Asynchronous block wrappers for parallax.
+ */
+
+#ifndef _BLOCKASYNC_H_
+#define _BLOCKASYNC_H_
+
+#include <assert.h>
+#include <xc.h>
+#include "vdi.h"
+
+struct io_ret /* tagged result handed to every io_cb_t callback */
+{
+ enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type; /* selects the live member of u */
+ union {
+ u64 a; /* IO_ADDR_T: read with IO_ADDR() */
+ char *b; /* IO_BLOCK_T: read with IO_BLOCK() */
+ int i; /* IO_INT_T: read with IO_INT() */
+ } u;
+};
+
+typedef void (*io_cb_t)(struct io_ret r, void *param);
+
+/* per-vdi lock structures to make sure requests run in a safe order. */
+struct radix_wait {
+ enum {RLOCK, WLOCK} type;
+ io_cb_t cb;
+ void *param;
+ struct radix_wait *next;
+};
+
+struct radix_lock {
+ pthread_mutex_t lock; /* NOTE(review): needs <pthread.h>, which this header does not include itself */
+ int lines[1024]; /* one slot per lock row; callers pass the row as L1_IDX(vaddr) -- exact meaning: TODO confirm in block-async.c */
+ struct radix_wait *waiters[1024]; /* per-row radix_wait list, chained through ->next -- presumably pending lockers; confirm */
+ enum {ANY, READ, STOP} state[1024]; /* per-row state; transitions are not visible in this header */
+};
+void radix_lock_init(struct radix_lock *r);
+
+void block_read(u64 addr, io_cb_t cb, void *param);
+void block_write(u64 addr, char *block, io_cb_t cb, void *param);
+void block_alloc(char *block, io_cb_t cb, void *param);
+void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void init_block_async(void);
+
+static inline u64 IO_ADDR(struct io_ret r) /* unwrap an IO_ADDR_T result */
+{
+ assert(r.type == IO_ADDR_T); /* a mismatched tag trips the assertion */
+ return r.u.a;
+}
+
+static inline char *IO_BLOCK(struct io_ret r) /* unwrap an IO_BLOCK_T result */
+{
+ assert(r.type == IO_BLOCK_T); /* a mismatched tag trips the assertion */
+ return r.u.b; /* callers in requests-async.c treat NULL as a failed read */
+}
+
+static inline int IO_INT(struct io_ret r) /* unwrap an IO_INT_T result */
+{
+ assert(r.type == IO_INT_T); /* a mismatched tag trips the assertion */
+ return r.u.i; /* write_cb treats a negative value as failure */
+}
+
+
+#endif //_BLOCKASYNC_H_
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/vdi_snap.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/vdi_snap.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,43 @@
+/**************************************************************************
+ *
+ * vdi_snap.c
+ *
+ * Snapshot a vdi.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/* vdi_snap <VDI id>: look the VDI up with vdi_get() and snapshot it.
+ * Exits with -1 on a missing argument or a failed lookup. */
+int main(int argc, char *argv[])
+{
+    vdi_t *target;
+
+    __init_blockstore();
+    __init_vdi();
+
+    if ( argc == 1 )
+    {
+        printf("usage: %s <VDI id>\n", argv[0]);
+        exit(-1);
+    }
+
+    target = vdi_get((u64) atoll(argv[1]));
+    if ( target == NULL )
+    {
+        printf("couldn't find the requested VDI.\n");
+        freeblock(target);
+        exit(-1);
+    }
+
+    vdi_snapshot(target);
+
+    return 0;
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/vdi_create.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/vdi_create.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,52 @@
+/**************************************************************************
+ *
+ * vdi_create.c
+ *
+ * Create a new vdi.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/* vdi_create <VDI Name> [<snap block> <snap idx>]: create a new VDI,
+ * optionally from the given snapshot id.  Exits with -1 on failure. */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi;
+    char name[VDI_NAME_SZ] = "";
+    snap_id_t id;
+    int from_snap = 0;
+
+    __init_blockstore();
+    __init_vdi();
+
+    if ( argc == 1 ) {
+        printf("usage: %s <VDI Name> [<snap block> <snap idx>]\n", argv[0]);
+        exit(-1);
+    }
+
+    /* Copy at most VDI_NAME_SZ-1 bytes; the last valid index of name[] is
+     * VDI_NAME_SZ-1, so that is where the terminator belongs. */
+    strncpy( name, argv[1], VDI_NAME_SZ - 1);
+    name[VDI_NAME_SZ - 1] = '\0';
+
+    if ( argc > 3 ) {
+        id.block = (u64) atoll(argv[2]);
+        id.index = (unsigned int) atol (argv[3]);
+        from_snap = 1;
+    }
+
+    vdi = vdi_create( from_snap ? &id : NULL, name);
+    if ( vdi == NULL ) {
+        printf("Failed to create VDI!\n");
+        exit(-1);
+    }
+    freeblock(vdi);
+    return (0);
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/vdi_validate.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/vdi_validate.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,97 @@
+/**************************************************************************
+ *
+ * vdi_validate.c
+ *
+ * Intended to sanity-check vdi_fill and the underlying vdi code.
+ *
+ * Block-by-block compare of a vdi with a file/device on the disk.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+#include "requests-async.h"
+
+/* vdi_validate <VDI id> <filename>: compare a VDI block-by-block against a
+ * file or device and report the first unmapped or differing block. */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi;
+    u64 id;
+    int fd;
+    struct stat st;
+    u64 tot_size;
+    char spage[BLOCK_SIZE], *dpage;
+    ssize_t count;  /* signed: read() reports errors as -1 */
+    u64 vblock = 0;
+
+    __init_blockstore();
+    init_block_async();
+    __init_vdi();
+
+    if ( argc < 3 ) {
+        printf("usage: %s <VDI id> <filename>\n", argv[0]);
+        exit(-1);
+    }
+
+    id = (u64) atoll(argv[1]);
+    vdi = vdi_get( id );
+    if ( vdi == NULL ) {
+        printf("Failed to retreive VDI %Ld!\n", id);
+        exit(-1);
+    }
+
+    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+    if (fd < 0) {
+        printf("Couldn't open %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    if ( fstat(fd, &st) != 0 ) {
+        printf("Couldn't stat %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    tot_size = (u64) st.st_size;
+    printf("Testing VDI %Ld (%Ld bytes).\n", id, tot_size);
+
+    printf(" ");
+    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
+        dpage = vdi_read_s(vdi, vblock);
+
+        if (dpage == NULL) {
+            printf("\n\nfound an unmapped VDI block (%Ld)\n", vblock);
+            exit(0);
+        }
+
+        /* Compare only the bytes this read() produced; past that, spage
+         * still holds data from the previous iteration. */
+        if (memcmp(spage, dpage, (size_t) count) != 0) {
+            printf("\n\nblocks don't match! (%Ld)\n", vblock);
+            exit(0);
+        }
+        freeblock(dpage);
+
+        vblock++;
+        if ((vblock % 1024) == 0) {
+            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
+            fflush(stdout);
+        }
+    }
+    printf("\n");
+    if (count < 0) {
+        printf("Couldn't read %s!\n", argv[2]);
+        exit(-1);
+    }
+    printf("VDI %Ld looks good!\n", id);
+    freeblock(vdi);
+    return (0);
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/vdi_fill.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/vdi_fill.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,81 @@
+/**************************************************************************
+ *
+ * vdi_fill.c
+ *
+ * Hoover a file or device into a vdi.
+ * You must first create the vdi with vdi_create.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "requests-async.h"
+#include "vdi.h"
+
+/* vdi_fill <VDI id> <filename>: copy a file or device into an existing
+ * VDI, one block per read(), starting at VDI block 0. */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi;
+    u64 id;
+    int fd;
+    struct stat st;
+    u64 tot_size;
+    char spage[BLOCK_SIZE];
+    ssize_t count;  /* signed: read() reports errors as -1 */
+    u64 vblock = 0;
+
+    __init_blockstore();
+    init_block_async();
+    __init_vdi();
+
+    if ( argc < 3 ) {
+        printf("usage: %s <VDI id> <filename>\n", argv[0]);
+        exit(-1);
+    }
+
+    id = (u64) atoll(argv[1]);
+    vdi = vdi_get( id );
+    if ( vdi == NULL ) {
+        printf("Failed to retreive VDI %Ld!\n", id);
+        exit(-1);
+    }
+
+    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+    if (fd < 0) {
+        printf("Couldn't open %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    if ( fstat(fd, &st) != 0 ) {
+        printf("Couldn't stat %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    tot_size = (u64) st.st_size;
+    printf("Filling VDI %Ld with %Ld bytes.\n", id, tot_size);
+
+    printf("%011Ld blocks total\n", tot_size / BLOCK_SIZE);
+    printf(" ");
+    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
+        vdi_write_s(vdi, vblock, spage);
+        vblock++;
+        if ((vblock % 512) == 0)
+            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
+        fflush(stdout);
+    }
+    printf("\n");
+    if (count < 0) {
+        printf("Couldn't read %s!\n", argv[2]);
+        exit(-1);
+    }
+    freeblock(vdi);
+    return (0);
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/radix.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/radix.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,631 @@
+/*
+ * Radix tree for mapping (up to) 63-bit virtual block IDs to
+ * 63-bit global block IDs
+ *
+ * Pointers within the tree set aside the least significant bit to indicate
+ * whether or not the target block is writable from this node.
+ *
+ * The block with ID 0 is assumed to be an empty block of all zeros
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <pthread.h>
+#include "blockstore.h"
+#include "radix.h"
+
+#define RADIX_TREE_MAP_SHIFT 9
+#define RADIX_TREE_MAP_MASK 0x1ff
+#define RADIX_TREE_MAP_ENTRIES 512
+
+/*
+#define DEBUG
+*/
+
+/* Experimental radix cache. */
+
+static pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER;
+static int rcache_count = 0;
+#define RCACHE_MAX 1024
+
+typedef struct rcache_st {
+ radix_tree_node *node;
+ u64 id;
+ struct rcache_st *hash_next;
+ struct rcache_st *cache_next;
+ struct rcache_st *cache_prev;
+} rcache_t;
+
+static rcache_t *rcache_head = NULL;
+static rcache_t *rcache_tail = NULL;
+
+#define RCHASH_SIZE 512ULL
+rcache_t *rcache[RCHASH_SIZE];
+#define RCACHE_HASH(_id) ((_id) & (RCHASH_SIZE - 1))
+
+/* __rcache_init: empty every bucket of the radix cache hash table. */
+void __rcache_init(void)
+{
+    rcache_t **bucket = rcache;
+    rcache_t **end    = rcache + RCHASH_SIZE;
+
+    while (bucket != end)
+        *bucket++ = NULL;
+}
+
+
+/*
+ * rcache_write: store a copy of a radix node in the cache under block id.
+ * @id:   block id the node belongs to
+ * @node: BLOCK_SIZE bytes of node data; copied, the caller keeps its buffer
+ *
+ * If id is already cached, its copy is overwritten and the entry moves to
+ * the head of the LRU list.  Otherwise a new entry is inserted at the head;
+ * once RCACHE_MAX entries exist the tail (least recently used) entry and its
+ * buffer are recycled instead of allocating.  If memory for a new entry
+ * cannot be obtained the cache is left unchanged.
+ */
+void rcache_write(u64 id, radix_tree_node *node)
+{
+    rcache_t *r, **curs;
+
+    pthread_mutex_lock(&rcache_mutex);
+
+    /* Is it already in the cache? */
+    for (r = rcache[RCACHE_HASH(id)]; r != NULL; r = r->hash_next)
+        if (r->id == id)
+            break;
+
+    if (r != NULL) {
+        memcpy(r->node, node, BLOCK_SIZE);
+
+        /* bring to front. */
+        if (r != rcache_head) {
+            /* r is not the head, so it has a predecessor. */
+            r->cache_prev->cache_next = r->cache_next;
+            if (r->cache_next != NULL)
+                r->cache_next->cache_prev = r->cache_prev;
+            else
+                rcache_tail = r->cache_prev;
+
+            r->cache_prev = NULL;
+            r->cache_next = rcache_head;
+            rcache_head->cache_prev = r;
+            rcache_head = r;
+        }
+        goto done;
+    }
+
+    if ( rcache_count == RCACHE_MAX )
+    {
+        /* Recycle the least recently used entry, keeping its buffer. */
+        r = rcache_tail;
+        rcache_tail = r->cache_prev;
+        if (rcache_tail != NULL)
+            rcache_tail->cache_next = NULL;
+        else
+            rcache_head = NULL;
+
+        /* unhook it from its hash chain */
+        curs = &rcache[RCACHE_HASH(r->id)];
+        while ((*curs) != r)
+            curs = &(*curs)->hash_next;
+        *curs = r->hash_next;
+
+    } else {
+
+        r = (rcache_t *)malloc(sizeof(rcache_t));
+        if (r == NULL)
+            goto done;
+        r->node = newblock();
+        if (r->node == NULL) {
+            free(r);
+            goto done;
+        }
+        rcache_count++;
+    }
+
+    memcpy(r->node, node, BLOCK_SIZE);
+    r->id = id;
+
+    r->hash_next = rcache[RCACHE_HASH(id)];
+    rcache[RCACHE_HASH(id)] = r;
+
+    r->cache_prev = NULL;
+    r->cache_next = rcache_head;
+    if (rcache_head != NULL) rcache_head->cache_prev = r;
+    rcache_head = r;
+    if (rcache_tail == NULL) rcache_tail = r;
+
+done:
+    pthread_mutex_unlock(&rcache_mutex);
+}
+
+/*
+ * rcache_read: look up block id in the radix cache.
+ * @id: block id to look for
+ *
+ * On a hit the entry is moved to the head of the LRU list and a newly
+ * allocated copy of the cached node is returned; the caller releases it
+ * with freeblock().  Returns NULL on a miss, or when the copy cannot be
+ * allocated.
+ */
+radix_tree_node *rcache_read(u64 id)
+{
+    rcache_t *r;
+    radix_tree_node *node = NULL;
+
+    pthread_mutex_lock(&rcache_mutex);
+
+    for (r = rcache[RCACHE_HASH(id)]; r != NULL; r = r->hash_next)
+        if (r->id == id)
+            break;
+
+    if (r == NULL)
+        goto done;                  /* miss */
+
+    /* bring to front. */
+    if (r != rcache_head)
+    {
+        /* r is not the head, so it has a predecessor. */
+        r->cache_prev->cache_next = r->cache_next;
+        if (r->cache_next != NULL)
+            r->cache_next->cache_prev = r->cache_prev;
+        else
+            rcache_tail = r->cache_prev;
+
+        r->cache_prev = NULL;
+        r->cache_next = rcache_head;
+        rcache_head->cache_prev = r;
+        rcache_head = r;
+    }
+
+    node = newblock();
+    if (node != NULL)
+        memcpy(node, r->node, BLOCK_SIZE);
+
+done:
+    pthread_mutex_unlock(&rcache_mutex);
+
+    return(node);
+}
+
+
+/*
+ * rc_readblock: read block id, consulting the radix cache first.
+ *
+ * A cache hit returns the copy handed out by rcache_read().  On a miss the
+ * block is fetched with readblock() and, if that succeeds, added to the
+ * cache.  Returns NULL when both fail.
+ */
+void *rc_readblock(u64 id)
+{
+    void *blk = (void *)rcache_read(id);
+
+    if (blk == NULL) {
+        blk = readblock(id);
+        if (blk != NULL)
+            rcache_write(id, blk);
+    }
+
+    return(blk);
+}
+
+/*
+ * rc_allocblock: allocate a block through allocblock() and, unless the
+ * returned id is ZERO, add the block to the radix cache under that id.
+ * Returns whatever allocblock() returned.
+ */
+u64 rc_allocblock(void *block)
+{
+    u64 id = allocblock(block);
+
+    if (id == ZERO)
+        return(id);
+
+    rcache_write(id, block);
+    return(id);
+}
+
+/*
+ * rc_writeblock: write block id through writeblock() and mirror it in the
+ * radix cache.
+ * @id:    block id to write
+ * @block: block contents
+ *
+ * The cache is only updated when the write did not report an error (a
+ * negative return, which is what update() tests for), so a failed write
+ * cannot leave the cache holding data that never reached the store.
+ * Returns writeblock()'s result unchanged.
+ */
+int rc_writeblock(u64 id, void *block)
+{
+    int ret;
+
+    ret = writeblock(id, block);
+    if (ret >= 0)
+        rcache_write(id, block);
+
+    return(ret);
+}
+
+
+/*
+ * block device interface and other helper functions
+ * with these functions, block id is just a 63-bit number, with
+ * no special consideration for the LSB
+ */
+radix_tree_node cloneblock(radix_tree_node block);
+
+/*
+ * main api
+ * with these functions, the LSB of root always indicates
+ * whether or not the block is writable, including the return
+ * values of update and snapshot
+ */
+u64 lookup(int height, u64 root, u64 key);
+u64 update(int height, u64 root, u64 key, u64 val);
+u64 snapshot(u64 root);
+
+/**
+ * cloneblock: make an in-memory copy of a radix node
+ * @block: node to copy; RADIX_TREE_MAP_ENTRIES entries are read from it
+ *
+ * @return: malloc'ed BLOCK_SIZE buffer holding every entry ANDed with
+ *          ONEMASK (writable bit cleared), or NULL if malloc fails
+ */
+radix_tree_node cloneblock(radix_tree_node block) {
+    radix_tree_node copy, src, dst;
+
+    copy = (radix_tree_node) malloc(BLOCK_SIZE);
+    if (copy == NULL) {
+        perror("cloneblock malloc");
+        return NULL;
+    }
+
+    src = block;
+    dst = copy;
+    while (dst != copy + RADIX_TREE_MAP_ENTRIES)
+        *dst++ = *src++ & ONEMASK;
+
+    return copy;
+}
+
+/**
+ * lookup: find a value given a key
+ * @height: height in bits of the radix tree
+ * @root: root node id, with set LSB indicating writable node
+ * @key: key to lookup
+ *
+ * Walks one node per RADIX_TREE_MAP_SHIFT bits of key, from the most
+ * significant chunk down to shift 0.  The LSB of the result is the AND of
+ * the LSBs of every entry followed, so it is set only when each link on
+ * the path was marked writable.
+ *
+ * @return: value on success, zero on error or when an entry on the path
+ *          refers to block ZERO
+ */
+
+u64 lookup(int height, u64 root, u64 key) {
+    radix_tree_node node;
+    u64 mask = ONE;
+
+    assert(key >> height == 0);
+
+    /* the root block may be smaller to ensure all leaves are full */
+    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+
+    /* now carve off equal sized chunks at each step */
+    for (;; height -= RADIX_TREE_MAP_SHIFT) {
+
+#ifdef DEBUG
+        printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", height, root,
+               (int) ((key >> height) & RADIX_TREE_MAP_MASK),
+               (iswritable(root) ? "" : " (readonly)"));
+#endif
+
+        if (getid(root) == ZERO)
+            return ZERO;
+
+        node = (radix_tree_node) rc_readblock(getid(root));
+        if (node == NULL)
+            return ZERO;
+
+        root = node[(key >> height) & RADIX_TREE_MAP_MASK];
+        mask &= root;
+        freeblock(node);
+
+        /* shift 0 is the last level: root now holds the stored value */
+        if (height == 0)
+            return ( root & ONEMASK ) | mask;
+    }
+}
+
+/*
+ * update: set a radix tree entry, doing copy-on-write as necessary
+ * @height: height in bits of the radix tree
+ * @root: root node id, with set LSB indicating writable node
+ * @key: key to set
+ * @val: value to set, s.t. radix(key)=val
+ *
+ * Recurses one node per RADIX_TREE_MAP_SHIFT bits of key.  A node whose
+ * reference is not writable is cloned and saved as a new block; a missing
+ * node (reference ZERO) is created.  A val of ZERO is indistinguishable
+ * from failure.
+ *
+ * @returns: (possibly new) root id on success (with LSB=1), 0 on failure
+ */
+
+u64 update(int height, u64 root, u64 key, u64 val) {
+    int offset;
+    u64 child;
+    radix_tree_node node;
+
+    /* base case--return val */
+    if (height == 0)
+        return val;
+
+    /* the root block may be smaller to ensure all leaves are full */
+    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+    offset = (key >> height) & RADIX_TREE_MAP_MASK;
+
+#ifdef DEBUG
+    printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root,
+           offset, (iswritable(root)?"":" (clone)"));
+#endif
+
+    /* load a block, or create a new one */
+    if (root == ZERO) {
+        node = (radix_tree_node) newblock();
+    } else {
+        node = (radix_tree_node) rc_readblock(getid(root));
+
+        /* Only clone a node that was actually read: cloneblock()
+         * dereferences its argument. */
+        if (node != NULL && !iswritable(root)) {
+            /* need to clone this node */
+            radix_tree_node oldnode = node;
+            node = cloneblock(oldnode);
+            freeblock(oldnode);
+            root = ZERO;    /* forces the clone to be saved as a new block */
+        }
+    }
+
+    if (node == NULL) {
+#ifdef DEBUG
+        printf("update: node is null!\n");
+#endif
+        return ZERO;
+    }
+
+    child = update(height, node[offset], key, val);
+
+    if (child == ZERO) {
+        freeblock(node);
+        return ZERO;
+    } else if (child == node[offset]) {
+        /* no change, so we already owned the child */
+        /* NOTE(review): root is ZERO here if this node was just cloned or
+         * created, in which case this assert fires -- confirm whether that
+         * case is reachable (e.g. rewriting an unchanged value below a
+         * read-only link). */
+        assert(iswritable(root));
+
+        freeblock(node);
+        return root;
+    }
+
+    node[offset] = child;
+
+    /* new/cloned blocks need to be saved */
+    if (root == ZERO) {
+        /* mark this as an owned block */
+        root = rc_allocblock(node);
+        if (root)
+            root = writable(root);
+    } else if (rc_writeblock(getid(root), node) < 0) {
+        freeblock(node);
+        return ZERO;
+    }
+
+    freeblock(node);
+    return root;
+}
+
+/**
+ * snapshot: create a snapshot
+ * @root: old root node
+ *
+ * Reads the old root node, clones it with cloneblock() and stores the
+ * clone as a newly allocated block.
+ *
+ * @return: new root node (marked writable), 0 on error
+ */
+u64 snapshot(u64 root) {
+    radix_tree_node orig, copy;
+    u64 newroot;
+
+    orig = rc_readblock(getid(root));
+    if (orig == NULL)
+        return ZERO;
+
+    copy = cloneblock(orig);
+    freeblock(orig);
+    if (copy == NULL)
+        return ZERO;
+
+    newroot = rc_allocblock(copy);
+    freeblock(copy);
+
+    if (newroot != ZERO)
+        return writable(newroot);
+
+    return ZERO;
+}
+
+/**
+ * collapse: collapse a parent onto a child.
+ * @height: height in bits of the tree below proot/croot
+ * @proot:  parent root reference
+ * @croot:  child root reference
+ *
+ * For every slot where the two nodes differ and the parent's link is
+ * writable, the corresponding sub-trees are collapsed recursively.  The
+ * parent's own block is released when both proot and croot are writable.
+ *
+ * NOTE: This assumes that parent and child really are, and further that
+ * there are no other children forked from this parent. (children of the
+ * child are okay...)
+ *
+ * @return: running total of released blocks; -1 if either root is ZERO or
+ *          cannot be read.
+ *          NOTE(review): a failure in a sub-tree sets the total to -1 but
+ *          later successes are still added on top -- confirm the intended
+ *          error accounting with the callers.
+ */
+
+int collapse(int height, u64 proot, u64 croot)
+{
+    /* ret starts at 0 so the final test is defined when no slot recursed */
+    int i, numlinks, ret = 0, total = 0;
+    radix_tree_node pnode, cnode;
+
+    if (height == 0) {
+        height = -1; /* terminate recursion */
+    } else {
+        height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+    }
+    numlinks = (1UL << RADIX_TREE_MAP_SHIFT);
+
+    /* Terminal cases: */
+
+    if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) )
+        return -1;
+
+    /* get roots */
+    if ((pnode = readblock(getid(proot))) == NULL)
+        return -1;
+
+    if ((cnode = readblock(getid(croot))) == NULL)
+    {
+        freeblock(pnode);
+        return -1;
+    }
+
+    /* For each writable link in proot */
+    for (i=0; i<numlinks; i++)
+    {
+        if ( pnode[i] == cnode[i] ) continue;
+
+        /* collapse (next level) */
+        /* if height != 0 and writable... */
+        if (( height >= 0 ) && ( iswritable(pnode[i]) ) )
+        {
+            ret = collapse(height, pnode[i], cnode[i]);
+            if (ret == -1)
+            {
+                total = -1;
+            } else {
+                total += ret;
+            }
+        }
+    }
+
+    /* if plink is writable, AND clink is writable -> free plink block */
+    if ( ( iswritable(proot) ) && ( iswritable(croot) ) )
+    {
+        releaseblock(getid(proot));
+        if (ret >=0) total++;
+    }
+
+    /* the in-memory copies are no longer needed */
+    freeblock(pnode);
+    freeblock(cnode);
+
+    return total;
+
+}
+
+
+/*
+ * print_root: dump the tree below root in Graphviz "dot" syntax.
+ * @root:   reference of the node to print
+ * @height: height in bits of the tree below root
+ * @dot_f:  stream to append to, or NULL for the outermost call
+ *
+ * When dot_f is NULL this call creates "radix.dot", writes the graph
+ * preamble and a box for root, and on every exit path writes the closing
+ * brace and closes the file.  Recursive calls pass the open stream down
+ * and only add nodes and edges.  Writable references are drawn bold blue.
+ */
+void print_root(u64 root, int height, FILE *dot_f)
+{
+    FILE *f = dot_f;    /* recursive calls write to the caller's stream */
+    int i;
+    radix_tree_node node;
+    char *style[2] = { "", "style=bold,color=blue," };
+
+    if (dot_f == NULL) {
+        f = fopen("radix.dot", "w");
+        if (f == NULL) {
+            perror("print_root: open");
+            return;
+        }
+
+        /* write graph preamble */
+        fprintf(f, "digraph G {\n");
+
+        /* add a node for this root. */
+        fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n",
+                getid(root), style[iswritable(root)], getid(root));
+    }
+
+    printf("print_root(%Ld)\n", getid(root));
+
+    /* base case */
+    if (height == 0) {
+        /* add a node and edge for each child root */
+        node = (radix_tree_node) readblock(getid(root));
+        if (node == NULL)
+            goto out;
+
+        for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) {
+            if (node[i] != ZERO) {
+                fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n",
+                        getid(node[i]), style[iswritable(node[i])],
+                        getid(node[i]));
+                fprintf(f, " n%Ld -> n%Ld [label=\"%d\"]\n", getid(root),
+                        getid(node[i]), i);
+            }
+        }
+        freeblock(node);
+        goto out;
+    }
+
+    /* the root block may be smaller to ensure all leaves are full */
+    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+
+    if (getid(root) == ZERO)
+        goto out;
+
+    node = (radix_tree_node) readblock(getid(root));
+    if (node == NULL)
+        goto out;
+
+    /* add a node and edge for each child root */
+    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
+        if (node[i] != ZERO) {
+            fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n",
+                    getid(node[i]), style[iswritable(node[i])],
+                    getid(node[i]));
+
+            print_root(node[i], height-RADIX_TREE_MAP_SHIFT, f);
+            fprintf(f, " n%Ld -> n%Ld [label=\"%d\"]\n", getid(root),
+                    getid(node[i]), i);
+        }
+
+    freeblock(node);
+
+out:
+    /* write graph postamble; only the call that opened the file closes it */
+    if (dot_f == NULL) {
+        fprintf(f, "}\n");
+        fclose(f);
+    }
+}
+
+#ifdef RADIX_STANDALONE
+
+/*
+ * Interactive test driver, built only with RADIX_STANDALONE.
+ * Reads one command per line from stdin until "quit", end of input or a
+ * read error, and applies it to a tree of height 34.
+ */
+int main(int argc, char **argv) {
+    u64 key = ZERO, val = ZERO;
+    u64 root = writable(2ULL);
+    u64 p = ZERO, c = ZERO;
+    int v;
+    char buff[4096];
+
+    __init_blockstore();
+
+    /* allocate one all-zero block before any command runs */
+    memset(buff, 0, 4096);
+    allocblock(buff);
+
+    printf("Recognized commands:\n"
+           "Note: the LSB of a node number indicates if it is writable\n"
+           " root <node> set root to <node>\n"
+           " snapshot take a snapshot of the root\n"
+           " set <key> <val> set key=val\n"
+           " get <key> query key\n"
+           " c <proot> <croot> collapse\n"
+           " pr <node> set root to <node> and print tree to dot\n"
+           " pf <1=verbose> print freelist\n"
+           " quit\n"
+           "\nroot = %Ld\n", root);
+    for (;;) {
+        printf("> ");
+        fflush(stdout);
+        /* NULL covers both end of input and a read error; testing only
+         * feof() would re-parse the stale buffer forever on an error. */
+        if (fgets(buff, sizeof(buff), stdin) == NULL)
+            break;
+        if (sscanf(buff, " root %Ld", &root) == 1) {
+            printf("root set to %Ld\n", root);
+        } else if (sscanf(buff, " set %Ld %Ld", &key, &val) == 2) {
+            root = update(34, root, key, val);
+            printf("root = %Ld\n", root);
+        } else if (sscanf(buff, " c %Ld %Ld", &p, &c) == 2) {
+            v = collapse(34, p, c);
+            printf("reclaimed %d blocks.\n", v);
+        } else if (sscanf(buff, " get %Ld", &key) == 1) {
+            val = lookup(34, root, key);
+            printf("value = %Ld\n", val);
+        } else if (!strcmp(buff, "quit\n")) {
+            break;
+        } else if (!strcmp(buff, "snapshot\n")) {
+            root = snapshot(root);
+            printf("new root = %Ld\n", root);
+        } else if (sscanf(buff, " pr %Ld", &root) == 1) {
+            print_root(root, 34, NULL);
+        } else if (sscanf(buff, " pf %d", &v) == 1) {
+            freelist_count(v);
+        } else if (!strcmp(buff, "pf\n")) {
+            freelist_count(0);
+        } else {
+            printf("command not recognized\n");
+        }
+    }
+    return 0;
+}
+
+#endif
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/radix.h
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/radix.h Sun Jul 3 14:14:09 2005
@@ -0,0 +1,45 @@
+/*
+ * Radix tree for mapping (up to) 63-bit virtual block IDs to
+ * 63-bit global block IDs
+ *
+ * Pointers within the tree set aside the least significant bit to indicate
+ * whether or not the target block is writable from this node.
+ *
+ * The block with ID 0 is assumed to be an empty block of all zeros
+ */
+
+#ifndef __RADIX_H__
+#define __RADIX_H__
+
+/*
+ * A node reference is a 64-bit value: the block id shifted left by one,
+ * with bit 0 set when the block is writable from the referring node.
+ * This header uses u64 but does not define it; it must be in scope before
+ * the header is included.
+ *
+ * I don't really like exposing these, but...
+ */
+#define getid(x)      (((x)>>1)&0x7fffffffffffffffLL) /* reference -> id */
+#define putid(x)      ((x)<<1)                  /* id -> read-only reference */
+#define writable(x)   (((x)<<1)|1LL)            /* id -> writable reference */
+#define iswritable(x) ((x)&1LL)                 /* test the writable bit */
+#define ZERO          0LL
+#define ONE           1LL
+/* Every bit except the writable bit.  Derived from ONE so the width of the
+ * mask cannot be mistyped. */
+#define ONEMASK       (~ONE)
+
+/* Each node consumes RADIX_TREE_MAP_SHIFT bits of the key. */
+#define RADIX_TREE_MAP_SHIFT 9
+#define RADIX_TREE_MAP_MASK 0x1ff
+#define RADIX_TREE_MAP_ENTRIES 512
+
+/* A node is an array of RADIX_TREE_MAP_ENTRIES references. */
+typedef u64 *radix_tree_node;
+
+
+/*
+ * main api
+ * with these functions, the LSB of root always indicates
+ * whether or not the block is writable, including the return
+ * values of update and snapshot
+ */
+u64 lookup(int height, u64 root, u64 key);
+u64 update(int height, u64 root, u64 key, u64 val);
+u64 snapshot(u64 root);
+int collapse(int height, u64 proot, u64 croot);
+int isprivate(int height, u64 root, u64 key);
+
+
+/* Empties the radix cache hash table. */
+void __rcache_init(void);
+
+#endif /* __RADIX_H__ */
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax/blockstored.c
--- /dev/null Sun Jul 3 12:02:01 2005
+++ b/tools/blktap/parallax/blockstored.c Sun Jul 3 14:14:09 2005
@@ -0,0 +1,276 @@
+/**************************************************************************
+ *
+ * blockstored.c
+ *
+ * Block store daemon.
+ *
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
+#include <errno.h>
+#include "blockstore.h"
+
+//#define BSDEBUG
+
+int readblock_into(u64 id, void *block);
+
+/*
+ * open_socket: create a UDP (AF_INET, SOCK_DGRAM) socket bound to port on
+ * INADDR_ANY.
+ * @port: port number in host byte order
+ *
+ * Returns the socket descriptor, or -1 if socket() or bind() fails.
+ */
+int open_socket(u16 port) {
+
+    struct sockaddr_in local;
+    int fd = socket(AF_INET, SOCK_DGRAM, 0);
+
+    if (fd < 0) {
+        perror("Bad socket");
+        return -1;
+    }
+
+    memset(&local, 0, sizeof(local));
+    local.sin_family      = AF_INET;
+    local.sin_addr.s_addr = htonl(INADDR_ANY);
+    local.sin_port        = htons(port);
+
+    if (bind(fd, (struct sockaddr *)&local, sizeof(local)) >= 0)
+        return fd;
+
+    perror("bind");
+    close(fd);
+    return -1;
+}
+
+static int block_fp = -1;
+static int bssock = -1;
+
+/*
+ * send_reply: send len bytes of buffer to peer over the daemon socket.
+ * @peer:   destination address
+ * @buffer: message to send
+ * @len:    number of bytes of buffer to send
+ *
+ * Returns 0 on success, 1 if sendto() fails.
+ */
+int send_reply(struct sockaddr_in *peer, void *buffer, int len) {
+
+    int rc;
+
+#ifdef BSDEBUG
+    fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n",
+            len, ((bsmsg_t *)buffer)->hdr.operation,
+            ((bsmsg_t *)buffer)->hdr.id);
+#endif
+    rc = sendto(bssock, buffer, len, 0, (struct sockaddr *)peer,
+                sizeof(*peer));
+    if (rc < 0) {
+        perror("send_reply");
+        return 1;
+    }
+
+
+    return 0;
+}
+
+static bsmsg_t msgbuf;
+
+/*
+ * service_loop: receive block store requests on bssock and answer them.
+ *
+ * Each datagram is read into msgbuf and dispatched on hdr.operation:
+ *   BSOP_READBLOCK  - reply carries the block (MSGBUFSIZE_BLOCK bytes)
+ *   BSOP_WRITEBLOCK - block is written, reply is header only
+ *   BSOP_ALLOCBLOCK - block is appended, reply header carries the new id
+ * A failed operation is answered with BSOP_FLAG_ERROR set in hdr.flags.
+ * Datagrams that are too short or carry an unknown operation are logged
+ * and dropped without a reply.  Never returns.
+ */
+void service_loop(void) {
+
+    for (;;) {
+        int rc, len;
+        struct sockaddr_in from;
+        socklen_t slen = sizeof(from);   /* recvfrom() takes a socklen_t * */
+        u64 bid;
+
+        len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0,
+                       (struct sockaddr *)&from, &slen);
+
+        if (len < 0) {
+            perror("recvfrom");
+            continue;
+        }
+
+        if (len < MSGBUFSIZE_OP) {
+            fprintf(stderr, "Short packet.\n");
+            continue;
+        }
+
+#ifdef BSDEBUG
+        fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n",
+                len, msgbuf.hdr.operation, msgbuf.hdr.id);
+#endif
+
+        switch (msgbuf.hdr.operation) {
+        case BSOP_READBLOCK:
+            if (len < MSGBUFSIZE_ID) {
+                fprintf(stderr, "Short packet (readblock %u).\n", len);
+                continue;
+            }
+            rc = readblock_into(msgbuf.hdr.id, msgbuf.block);
+            if (rc < 0) {
+                fprintf(stderr, "readblock error\n");
+                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
+                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+                continue;
+            }
+            msgbuf.hdr.flags = 0;
+            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK);
+            break;
+        case BSOP_WRITEBLOCK:
+            if (len < MSGBUFSIZE_BLOCK) {
+                fprintf(stderr, "Short packet (writeblock %u).\n", len);
+                continue;
+            }
+            rc = writeblock(msgbuf.hdr.id, msgbuf.block);
+            if (rc < 0) {
+                fprintf(stderr, "writeblock error\n");
+                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
+                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+                continue;
+            }
+            msgbuf.hdr.flags = 0;
+            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+            break;
+        case BSOP_ALLOCBLOCK:
+            if (len < MSGBUFSIZE_BLOCK) {
+                fprintf(stderr, "Short packet (allocblock %u).\n", len);
+                continue;
+            }
+            bid = allocblock(msgbuf.block);
+            if (bid == ALLOCFAIL) {
+                fprintf(stderr, "allocblock error\n");
+                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
+                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+                continue;
+            }
+            msgbuf.hdr.id = bid;
+            msgbuf.hdr.flags = 0;
+            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+            break;
+        default:
+            /* still no reply, but leave a trace of the dropped request */
+            fprintf(stderr, "Unknown operation %u.\n",
+                    (unsigned) msgbuf.hdr.operation);
+            break;
+        }
+
+    }
+}
+
+/**
+ * readblock_into: read a block from disk into a caller-supplied buffer
+ * @id: block id to read; block id N is stored at offset (N-1)*BLOCK_SIZE
+ * @block: pointer to buffer to receive BLOCK_SIZE bytes
+ *
+ * @return: 0 if OK, -1 if the seek fails or fewer than BLOCK_SIZE bytes
+ *          could be read
+ */
+
+int readblock_into(u64 id, void *block) {
+    off64_t where = ((off64_t) id - 1LL) * BLOCK_SIZE;
+
+    if (lseek64(block_fp, where, SEEK_SET) < 0) {
+        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
+        perror("readblock lseek");
+        return -1;
+    }
+
+    if (read(block_fp, block, BLOCK_SIZE) == BLOCK_SIZE)
+        return 0;
+
+    perror("readblock read");
+    return -1;
+}
+
+/**
+ * writeblock: write an existing block to disk
+ * @id: block id; block id N is stored at offset (N-1)*BLOCK_SIZE
+ * @block: pointer to BLOCK_SIZE bytes of data
+ *
+ * A short write is treated as a failure, matching allocblock().
+ *
+ * @return: zero on success, -1 on failure
+ */
+int writeblock(u64 id, void *block) {
+    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
+        perror("writeblock lseek");
+        return -1;
+    }
+    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
+        perror("writeblock write");
+        return -1;
+    }
+    return 0;
+}
+
+/**
+ * allocblock: write a new block to disk
+ * @block: pointer to BLOCK_SIZE bytes of data
+ *
+ * The block is appended at the current end of the store, which must be a
+ * multiple of BLOCK_SIZE.  With BS_ALLOC_HACK defined, the append is
+ * repeated until the resulting id is at least BS_ALLOC_SKIP.
+ *
+ * @return: new id of block on disk (end offset / BLOCK_SIZE + 1), or
+ *          ALLOCFAIL on error
+ */
+static u64 lastblock = 0;   /* id handed out by the previous call */
+
+u64 allocblock(void *block) {
+    u64 newid;
+    off64_t end;
+
+    for (;;) {
+        end = lseek64(block_fp, 0, SEEK_END);
+        if (end == (off64_t)-1) {
+            perror("allocblock lseek");
+            return ALLOCFAIL;
+        }
+        if (end % BLOCK_SIZE != 0) {
+            fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
+            return ALLOCFAIL;
+        }
+        if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
+            perror("allocblock write");
+            return ALLOCFAIL;
+        }
+        newid = end / BLOCK_SIZE + 1;
+
+#ifdef BS_ALLOC_HACK
+        if (newid < BS_ALLOC_SKIP)
+            continue;
+#endif
+        break;
+    }
+
+    /* ids are expected to grow with every call */
+    if (newid <= lastblock)
+        printf("[*** %Ld alredy allocated! ***]\n", newid);
+
+    lastblock = newid;
+    return newid;
+}
+
+/**
+ * newblock: get a new in-memory block set to zeros
+ *
+ * @return: pointer to a zero-filled BLOCK_SIZE buffer, NULL on error
+ */
+void *newblock() {
+    void *zeroed = calloc(1, BLOCK_SIZE);
+
+    if (zeroed != NULL)
+        return zeroed;
+
+    perror("newblock");
+    return NULL;
+}
+
+
+/**
+ * freeblock: unallocate an in-memory block
+ * @block: block to be freed; NULL is accepted and ignored
+ */
+void freeblock(void *block) {
+    /* free() is already a no-op for a null pointer */
+    free(block);
+}
+
+
+/*
+ * blockstored entry point: open (creating if needed) "blockstore.dat" in
+ * the current directory, bind the UDP socket on BLOCKSTORED_PORT and serve
+ * requests.  Returns -1 if the store or the socket cannot be opened.
+ */
+int main(int argc, char **argv)
+{
+    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
+    if (block_fp < 0) {
+        perror("open");
+        return -1;
+    }
+
+    bssock = open_socket(BLOCKSTORED_PORT);
+    if (bssock >= 0) {
+        service_loop();
+
+        close(bssock);
+
+        return 0;
+    }
+
+    return -1;
+}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/README-PARALLAX
--- a/tools/blktap/README-PARALLAX Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,177 +0,0 @@
-Parallax Quick Overview
-March 3, 2005
-
-This is intended to provide a quick set of instructions to let you
-guys play with the current parallax source. In it's current form, the
-code will let you run an arbitrary number of VMs off of a single disk
-image, doing copy-on-write as they make updates. Each domain is
-assigned a virtual disk image (VDI), which may be based on a snapshot
-of an existing image. All of the VDI and snapshot management should
-currently work.
-
-The current implementation uses a single file as a blockstore for
-_everything_ this will soon be replaced by the fancier backend code
-and the local cache. As it stands, Parallax will create
-"blockstore.dat" in the directory that you run it from, and use
-largefile support to make this grow to unfathomable girth. So, you
-probably want to run the daemon off of a local disk, with a lot of
-free space.
-
-Here's how to get going:
-
-0. Setup:
----------
-
-Pick a local directory on a disk with lots of room. You should be
-running from a privileged domain (e.g. dom0) with the blocktap
-configured in and block backend NOT.
-
-For convenience (for the moment) copy all of the vdi tools (vdi_*) and
-the parallax daemon from tools/blktap into this directory.
-
-1. Populate the blockstore:
----------------------------
-
-First you need to put at least one image into the blockstore. You
-will need a disk image, either as a file or local partition. My
-general approach has been to
-
-(a) make a really big sparse file with
-
- dd if=/dev/zero of=./image bs=4K count=1 seek=[big value]
-
-(b) put a filesystem into it
-
- mkfs.ext3 ./image
-
-(c) mount it using loopback
-
- mkdir ./mnt
- mount -o loop ./image
-
-(d) cd into it and untar one of the image files from srg-roots.
-
- cd mnt
- tar ...
-
-NOTE: Beware if your system is FC3. mkfs is not compatible with old
-versions of fedora, and so you don't have much choice but to install
-further fc3 images if you have used the fc3 version of mkfs.
-
-(e) unmount the image
-
- cd ..
- umount mnt
-
-(f) now, create a new VDI to hold the image
-
- ./vdi_create "My new FC3 VDI"
-
-(g) get the id of the new VDI.
-
- ./vdi_list
-
- | 0 My new FC3 VDI
-
-(0 is the VDI id... create a few more if you want.)
-
-(h) hoover your image into the new VDI.
-
- ./vdi_fill 0 ./image
-
-This will pull the entire image into the blockstore and set up a
-mapping tree for it for VDI 0. Passing a device (i.e. /dev/sda3)
-should also work, but vdi_fill has NO notion of sparseness yet, so you
-are going to pump a block into the store for each block you read.
-
-vdi_fill will count up until it is done, and you should be ready to
-go. If you want to be anal, you can use vdi_validate to test the VDI
-against the original image.
-
-2. Create some extra VDIs
--------------------------
-
-VDIs are actually a list of snapshots, and each snapshot is a full
-image of mappings. So, to preserve an immutable copy of a current
-VDI, do this:
-
-(a) Snapshot your new VDI.
-
- ./vdi_snap 0
-
-Snapshotting writes the current radix root to the VDI's snapshot log,
-and assigns it a new writable root.
-
-(b) look at the VDI's snapshot log.
-
- ./vdi_snap_list 0
-
- | 16 0 Thu Mar 3 19:27:48 2005 565111 31
-
-The first two columns constitute a snapshot id and represent the
-(block, offset) of the snapshot record. The Date tells you when the
-snapshot was made, and 31 is the radix root node of the snapshot.
-
-(c) Create a new VDI, based on that snapshot, and look at the list.
-
- ./vdi_create "FC3 - Copy 1" 16 0
- ./vdi_list
-
- | 0 My new FC3 VDI
- | 1 FC3 - Copy 1
-
-NOTE: If you have Graphviz installed on your system, you can use
-vdi_tree to generate a postscript of your current set of VDIs and
-snapshots.
-
-
-Create as many VDIs as you need for the VMs that you want to run.
-
-3. Boot some VMs:
------------------
-
-Parallax currently uses a hack in xend to pass the VDI id, you need to
-modify the disk line of the VM config that is going to mount it.
-
-(a) set up your vm config, by using the following disk line:
-
- disk = ['parallax:1,sda1,w,0' ]
-
-This example uses VDI 1 (from vdi_list above), presents it as sda1
-(writable), and uses dom 0 as the backend. If you were running the
-daemon (and tap driver) in some domain other than 0, you would change
-this last parameter.
-
-NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so that it knows what to do with "parallax:".
-
-(b) Run parallax in the backend domain.
-
- ./parallax
-
-(c) create your new domain.
-
- xm create ...
-
----
-
-That's pretty much all there is to it at the moment. Hope this is
-clear enough to get you going. Now, a few serious caveats that will
-be sorted out in the almost immediate future:
-
-WARNINGS:
----------
-
-1. There is NO locking in the VDI tools at the moment, so I'd avoid
-running them in parallel, or more importantly, running them while the
-daemon is running.
-
-2. I doubt that xend will be very happy about restarting if you have
-parallax-using domains. So if it dies while there are active parallax
-doms, you may need to reboot.
-
-3. I've turned off write-in-place. So at the moment, EVERY block
-write is a log append on the blockstore. I've been having some probs
-with the radix tree's marking of writable blocks after snapshots and
-will sort this out very soon.
-
-
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/block-async.c
--- a/tools/blktap/block-async.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,393 +0,0 @@
-/* block-async.c
- *
- * Asynchronous block wrappers for parallax.
- */
-
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <pthread.h>
-#include "block-async.h"
-#include "blockstore.h"
-#include "vdi.h"
-
-
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-/* We have a queue of outstanding I/O requests implemented as a
- * circular producer-consumer ring with free-running buffers.
- * to allow reordering, this ring indirects to indexes in an
- * ring of io_structs.
- *
- * the block_* calls may either add an entry to this ring and return,
- * or satisfy the request immediately and call the callback directly.
- * None of the io calls in parallax should be nested enough to worry
- * about stack problems with this approach.
- */
-
-struct read_args {
- u64 addr;
-};
-
-struct write_args {
- u64 addr;
- char *block;
-};
-
-struct alloc_args {
- char *block;
-};
-
-struct pending_io_req {
- enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op;
- union {
- struct read_args r;
- struct write_args w;
- struct alloc_args a;
- } u;
- io_cb_t cb;
- void *param;
-};
-
-void radix_lock_init(struct radix_lock *r)
-{
- int i;
-
- pthread_mutex_init(&r->lock, NULL);
- for (i=0; i < 1024; i++) {
- r->lines[i] = 0;
- r->waiters[i] = NULL;
- r->state[i] = ANY;
- }
-}
-
-/* maximum outstanding I/O requests issued asynchronously */
-/* must be a power of 2.*/
-#define MAX_PENDING_IO 1024
-
-/* how many threads to concurrently issue I/O to the disk. */
-#define IO_POOL_SIZE 10
-
-static struct pending_io_req pending_io_reqs[MAX_PENDING_IO];
-static int pending_io_list[MAX_PENDING_IO];
-static unsigned long io_prod = 0, io_cons = 0, io_free = 0;
-#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1))
-#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs)
-#define PENDING_IO_ENT(_x) \
- (&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]])
-#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod)
-#define CAN_CONSUME_PENDING_IO (io_cons != io_prod)
-static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t pending_io_cond = PTHREAD_COND_INITIALIZER;
-
-static void init_pending_io(void)
-{
- int i;
-
- for (i=0; i<MAX_PENDING_IO; i++)
- pending_io_list[i] = i;
-
-}
-
-void block_read(u64 addr, io_cb_t cb, void *param)
-{
- struct pending_io_req *req;
-
- pthread_mutex_lock(&pending_io_lock);
- assert(CAN_PRODUCE_PENDING_IO);
-
- req = PENDING_IO_ENT(io_prod++);
- DPRINTF("Produce (R) %lu (%p)\n", io_prod - 1, req);
- req->op = IO_READ;
- req->u.r.addr = addr;
- req->cb = cb;
- req->param = param;
-
- pthread_cond_signal(&pending_io_cond);
- pthread_mutex_unlock(&pending_io_lock);
-}
-
-
-void block_write(u64 addr, char *block, io_cb_t cb, void *param)
-{
- struct pending_io_req *req;
-
- pthread_mutex_lock(&pending_io_lock);
- assert(CAN_PRODUCE_PENDING_IO);
-
- req = PENDING_IO_ENT(io_prod++);
- DPRINTF("Produce (W) %lu (%p)\n", io_prod - 1, req);
- req->op = IO_WRITE;
- req->u.w.addr = addr;
- req->u.w.block = block;
- req->cb = cb;
- req->param = param;
-
- pthread_cond_signal(&pending_io_cond);
- pthread_mutex_unlock(&pending_io_lock);
-}
-
-
-void block_alloc(char *block, io_cb_t cb, void *param)
-{
- struct pending_io_req *req;
-
- pthread_mutex_lock(&pending_io_lock);
- assert(CAN_PRODUCE_PENDING_IO);
-
- req = PENDING_IO_ENT(io_prod++);
- req->op = IO_ALLOC;
- req->u.a.block = block;
- req->cb = cb;
- req->param = param;
-
- pthread_cond_signal(&pending_io_cond);
- pthread_mutex_unlock(&pending_io_lock);
-}
-
-/*
- * Append a deferred lock request of kind @type (RLOCK or WLOCK) to the tail
- * of the waiter list for @row.  Must be called with r->lock held.
- *
- * Returns 0 on success, -1 if the waiter record could not be allocated.
- */
-static int radix_defer_lock(struct radix_lock *r, int row, int type,
-                            io_cb_t cb, void *param)
-{
-    struct radix_wait **tail;
-    struct radix_wait *rw = (struct radix_wait *) malloc(sizeof(*rw));
-
-    if (rw == NULL)
-        return -1;
-
-    rw->type  = type;
-    rw->param = param;
-    rw->cb    = cb;
-    rw->next  = NULL;
-
-    /* append to waiters list. */
-    tail = &r->waiters[row];
-    while (*tail != NULL)
-        tail = &(*tail)->next;
-    *tail = rw;
-
-    return 0;
-}
-
-/*
- * block_rlock: take a read lock on @row of @r.
- * @cb/@param: callback (and its argument) run once the lock is held.
- *
- * If the row is not write-locked and no writer is waiting (state != STOP)
- * the reader count is bumped and @cb is called immediately with IO_INT 0.
- * Otherwise the request is queued and @cb runs later from wake_waiters().
- * If the request cannot be queued (out of memory), @cb is called with
- * IO_INT -1 instead of dereferencing a NULL waiter record.
- */
-void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
-{
-    struct io_ret ret;
-    int queued;
-
-    ret.type = IO_INT_T;
-    ret.u.i  = 0;
-
-    pthread_mutex_lock(&r->lock);
-
-    if (( r->lines[row] >= 0 ) && (r->state[row] != STOP)) {
-        r->lines[row]++;
-        r->state[row] = READ;
-        DPRINTF("RLOCK : %3d (row: %d)\n", r->lines[row], row);
-        pthread_mutex_unlock(&r->lock);
-        cb(ret, param);
-        return;
-    }
-
-    DPRINTF("RLOCK : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
-    queued = radix_defer_lock(r, row, RLOCK, cb, param);
-    pthread_mutex_unlock(&r->lock);
-
-    if (queued < 0) {
-        fprintf(stderr, "block_rlock: out of memory queueing waiter\n");
-        ret.u.i = -1;
-        cb(ret, param);
-    }
-}
-
-
-/*
- * block_wlock: take a write lock on @row of @r.
- * @cb/@param: callback (and its argument) run once the lock is held.
- *
- * If the row is idle (state ANY, count 0) it is marked STOP / -1 and @cb is
- * called immediately with IO_INT 0.  Otherwise the request is queued and
- * @cb runs later from wake_waiters().  If the request cannot be queued
- * (out of memory), @cb is called with IO_INT -1.
- */
-void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
-{
-    struct io_ret ret;
-    int queued;
-
-    ret.type = IO_INT_T;
-    ret.u.i  = 0;
-
-    pthread_mutex_lock(&r->lock);
-
-    /* the second check here is redundant -- just here for debugging now. */
-    if ((r->state[row] == ANY) && ( r->lines[row] == 0 )) {
-        r->state[row] = STOP;
-        r->lines[row] = -1;
-        DPRINTF("WLOCK : %3d (row: %d)\n", r->lines[row], row);
-        pthread_mutex_unlock(&r->lock);
-        cb(ret, param);
-        return;
-    }
-
-    DPRINTF("WLOCK : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
-    queued = radix_defer_lock(r, row, WLOCK, cb, param);
-    pthread_mutex_unlock(&r->lock);
-
-    if (queued < 0) {
-        fprintf(stderr, "block_wlock: out of memory queueing waiter\n");
-        ret.u.i = -1;
-        cb(ret, param);
-    }
-}
-
-
-/* Queue a wake-up request (@op is IO_RWAKE or IO_WWAKE) that will run the
- * callback stored in waiter @rw. */
-static void post_wake_request(int op, struct radix_wait *rw)
-{
-    struct pending_io_req *slot;
-
-    pthread_mutex_lock(&pending_io_lock);
-    assert(CAN_PRODUCE_PENDING_IO);
-
-    slot = PENDING_IO_ENT(io_prod++);
-    slot->op    = op;
-    slot->cb    = rw->cb;
-    slot->param = rw->param;
-    pthread_mutex_unlock(&pending_io_lock);
-}
-
-/* called with radix_lock locked and lock count of zero.
- *
- * Hands the row to whoever is first in the waiter list: a single writer,
- * or the whole run of readers queued ahead of the next writer.  Each woken
- * waiter becomes a pending I/O request so that its callback runs on an I/O
- * thread rather than here. */
-static void wake_waiters(struct radix_lock *r, int row)
-{
-    struct radix_wait *head;
-
-    if ((r->lines[row] != 0) || (r->waiters[row] == NULL))
-        return;
-
-    head = r->waiters[row];
-
-    if (head->type == WLOCK) {
-        post_wake_request(IO_WWAKE, head);
-        r->lines[row]   = -1;   /* write lock the row. */
-        r->state[row]   = STOP;
-        r->waiters[row] = head->next;
-        free(head);
-    } else /* RLOCK */ {
-        while ((head != NULL) && (head->type == RLOCK)) {
-            post_wake_request(IO_RWAKE, head);
-            r->lines[row]++;    /* read lock the row. */
-            r->state[row]   = READ;
-            r->waiters[row] = head->next;
-            free(head);
-            head = r->waiters[row];
-        }
-
-        if (head != NULL) /* There is a write queued still */
-            r->state[row] = STOP;
-    }
-
-    pthread_mutex_lock(&pending_io_lock);
-    pthread_cond_signal(&pending_io_cond);
-    pthread_mutex_unlock(&pending_io_lock);
-}
-
-/*
- * block_runlock: drop one read lock on @row of @r, then call @cb.
- *
- * When the last reader leaves, the row goes back to ANY and any queued
- * waiters are woken.  @cb is always invoked, with IO_INT 0.
- */
-void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
-{
-    struct io_ret ret;
-
-    /* The result is passed by value to the callback, so it must be
-     * initialised: IO_INT() asserts on the type tag. */
-    ret.type = IO_INT_T;
-    ret.u.i  = 0;
-
-    pthread_mutex_lock(&r->lock);
-    assert(r->lines[row] > 0); /* try to catch misuse. */
-    r->lines[row]--;
-    if (r->lines[row] == 0) {
-        r->state[row] = ANY;
-        wake_waiters(r, row);
-    }
-    pthread_mutex_unlock(&r->lock);
-    cb(ret, param);
-}
-
-/*
- * block_wunlock: drop the write lock on @row of @r, then call @cb.
- *
- * The row returns to ANY with a count of zero and any queued waiters are
- * woken.  @cb is always invoked, with IO_INT 0.
- */
-void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
-{
-    struct io_ret ret;
-
-    /* The result is passed by value to the callback, so it must be
-     * initialised: IO_INT() asserts on the type tag. */
-    ret.type = IO_INT_T;
-    ret.u.i  = 0;
-
-    pthread_mutex_lock(&r->lock);
-    assert(r->lines[row] == -1); /* try to catch misuse. */
-    r->lines[row] = 0;
-    r->state[row] = ANY;
-    wake_waiters(r, row);
-    pthread_mutex_unlock(&r->lock);
-    cb(ret, param);
-}
-
-/* consumer calls */
-
-/*
- * do_next_io_req: carry out one queued request and run its callback.
- *
- * The operation is performed first, then the request entry is returned to
- * the free list, and only then is the callback invoked.  Everything needed
- * from the entry (callback and parameter) is copied out before the entry
- * is released, because a producer may reuse it as soon as it is free.
- * An entry with an unknown op is still recycled, but no callback is run.
- */
-static void do_next_io_req(struct pending_io_req *req)
-{
-    struct io_ret ret;
-    io_cb_t cb;
-    void *param;
-    int known = 1;
-
-    switch (req->op) {
-    case IO_READ:
-        ret.type = IO_BLOCK_T;
-        ret.u.b = readblock(req->u.r.addr);
-        break;
-    case IO_WRITE:
-        ret.type = IO_INT_T;
-        ret.u.i = writeblock(req->u.w.addr, req->u.w.block);
-        DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr);
-        break;
-    case IO_ALLOC:
-        ret.type = IO_ADDR_T;
-        ret.u.a = allocblock(req->u.a.block);
-        break;
-    case IO_RWAKE:
-        DPRINTF("WAKE DEFERRED RLOCK!\n");
-        ret.type = IO_INT_T;
-        ret.u.i = 0;
-        break;
-    case IO_WWAKE:
-        DPRINTF("WAKE DEFERRED WLOCK!\n");
-        ret.type = IO_INT_T;
-        ret.u.i = 0;
-        break;
-    default:
-        DPRINTF("Unknown IO operation on pending list!\n");
-        known = 0;
-        break;
-    }
-
-    cb    = req->cb;
-    param = req->param;
-
-    pthread_mutex_lock(&pending_io_lock);
-    pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req);
-    pthread_mutex_unlock(&pending_io_lock);
-
-    if (!known)
-        return;
-
-    assert(cb != NULL);
-    cb(ret, param);
-}
-
-/*
- * io_thread: body of one worker in the I/O pool.
- * @param: heap-allocated int holding this worker's id; freed here.
- *
- * Loops forever: sleeps on pending_io_cond while the producer and consumer
- * indices are equal, then claims the next entry and executes it outside
- * the lock.  The id is only read into a local; nothing else in this
- * function uses it.
- */
-void *io_thread(void *param)
-{
-    struct pending_io_req *next;
-    int tid;
-
-    /* Set this thread's tid. */
-    tid = *(int *)param;
-    free(param);
-
-    for (;;) {
-        pthread_mutex_lock(&pending_io_lock);
-        while (io_prod == io_cons)
-            pthread_cond_wait(&pending_io_cond, &pending_io_lock);
-
-        next = PENDING_IO_ENT(io_cons++);
-        pthread_mutex_unlock(&pending_io_lock);
-
-        do_next_io_req(next);
-    }
-}
-
-static pthread_t io_pool[IO_POOL_SIZE];
-
-/*
- * start_io_threads: create the IO_POOL_SIZE worker threads.
- *
- * Each worker receives its index in a heap-allocated int that the worker
- * itself frees.  A worker that cannot be started is reported and skipped;
- * its id allocation is released here since no thread will ever free it.
- */
-void start_io_threads(void)
-{
-    int i;
-
-    for (i = 0; i < IO_POOL_SIZE; i++) {
-        int ret, *t;
-
-        t = (int *)malloc(sizeof(int));
-        if (t == NULL) {
-            printf("Error starting thread %d\n", i);
-            continue;
-        }
-        *t = i;
-
-        ret = pthread_create(&io_pool[i], NULL, io_thread, t);
-        if (ret != 0) {
-            printf("Error starting thread %d\n", i);
-            free(t);
-        }
-    }
-}
-
-/* One-time set-up of the asynchronous block layer: initialise the pending
- * I/O state first, then start the worker threads that consume it. */
-void init_block_async(void)
-{
- init_pending_io();
- start_io_threads();
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/block-async.h
--- a/tools/blktap/block-async.h Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,69 +0,0 @@
-/* block-async.h
- *
- * Asynchronous block wrappers for parallax.
- */
-
-#ifndef _BLOCKASYNC_H_
-#define _BLOCKASYNC_H_
-
-#include <assert.h>
-#include <xc.h>
-#include "vdi.h"
-
-/* Result handed to a completion callback: a tagged union in which 'type'
- * says which member of 'u' is meaningful. */
-struct io_ret
-{
- enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type;
- union {
- u64 a; /* valid when type == IO_ADDR_T; read with IO_ADDR() */
- char *b; /* valid when type == IO_BLOCK_T; read with IO_BLOCK() */
- int i; /* valid when type == IO_INT_T; read with IO_INT() */
- } u;
-};
-
-/* Completion callback: gets the result by value and the caller's 'param'. */
-typedef void (*io_cb_t)(struct io_ret r, void *param);
-
-/* per-vdi lock structures to make sure requests run in a safe order. */
-
-/* One deferred lock request, kept on a singly linked per-row list. */
-struct radix_wait {
- enum {RLOCK, WLOCK} type; /* kind of lock being waited for */
- io_cb_t cb; /* run once the lock has been granted */
- void *param; /* passed back to cb */
- struct radix_wait *next; /* next waiter; new waiters go on the tail */
-};
-
-/* Reader/writer lock state for 1024 rows, all guarded by 'lock'. */
-struct radix_lock {
- pthread_mutex_t lock;
- int lines[1024]; /* per row: >0 reader count, 0 idle, -1 write-locked */
- struct radix_wait *waiters[1024]; /* per row: queued lock requests */
- enum {ANY, READ, STOP} state[1024]; /* per row: STOP refuses new readers */
-};
-void radix_lock_init(struct radix_lock *r);
-
-/* Asynchronous entry points.  Each takes a completion callback 'cb' and an
- * opaque 'param' that is handed back to it. */
-void block_read(u64 addr, io_cb_t cb, void *param);
-void block_write(u64 addr, char *block, io_cb_t cb, void *param);
-void block_alloc(char *block, io_cb_t cb, void *param);
-/* Row locks: cb runs once the lock is held (lock) or released (unlock). */
-void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
-void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
-void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
-void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
-void init_block_async(void);
-
-/* Return the u64 member of @r; asserts that @r is tagged IO_ADDR_T. */
-static inline u64 IO_ADDR(struct io_ret r)
-{
- assert(r.type == IO_ADDR_T);
- return r.u.a;
-}
-
-/* Return the block pointer of @r; asserts that @r is tagged IO_BLOCK_T. */
-static inline char *IO_BLOCK(struct io_ret r)
-{
- assert(r.type == IO_BLOCK_T);
- return r.u.b;
-}
-
-/* Return the int member of @r; asserts that @r is tagged IO_INT_T. */
-static inline int IO_INT(struct io_ret r)
-{
- assert(r.type == IO_INT_T);
- return r.u.i;
-}
-
-
-#endif //_BLOCKASYNC_H_
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/blockstore.c
--- a/tools/blktap/blockstore.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,1350 +0,0 @@
-/**************************************************************************
- *
- * blockstore.c
- *
- * Simple block store interface
- *
- */
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <stdarg.h>
-#include "blockstore.h"
-#include <pthread.h>
-
-//#define BLOCKSTORE_REMOTE
-//#define BSDEBUG
-
-#define RETRY_TIMEOUT 1000000 /* microseconds */
-
-/*****************************************************************************
- * Debugging
- *
- * DB(format, ...) is a printf-style trace to stderr.  With BSDEBUG defined
- * each message is prefixed with the value stored under tid_key for the
- * calling thread; without it DB() expands to a no-op expression.
- */
-#ifdef BSDEBUG
-void DB(char *format, ...)
-{
- va_list args;
- fprintf(stderr, "[%05u] ", (int)pthread_getspecific(tid_key));
- va_start(args, format);
- vfprintf(stderr, format, args);
- va_end(args);
-}
-#else
-#define DB(format, ...) (void)0
-#endif
-
-#ifdef BLOCKSTORE_REMOTE
-
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <netinet/in.h>
-#include <netdb.h>
-
-/*****************************************************************************
- * Network state *
- *****************************************************************************/
-
-/* The individual disk servers we talk to. These are referenced by
- * an integer index into bsservers[].
- */
-bsserver_t bsservers[MAX_SERVERS];
-
-/* The cluster map. This is indexed by an integer cluster number.
- */
-bscluster_t bsclusters[MAX_CLUSTERS];
-
-/* Local socket.
- */
-struct sockaddr_in sin_local;
-int bssock = 0;
-
-/*****************************************************************************
- * Notification *
- *****************************************************************************/
-
-/* Per-thread notification slot used by RECV_NOTIFY / RECV_AWAIT. */
-typedef struct pool_thread_t_struct {
- pthread_mutex_t ptmutex; /* guards newdata and pairs with ptcv */
- pthread_cond_t ptcv; /* signalled by RECV_NOTIFY */
- int newdata; /* set to 1 by RECV_NOTIFY, cleared by RECV_AWAIT */
-} pool_thread_t;
-
-/* One slot per tid; sized READ_POOL_SIZE+1. */
-pool_thread_t pool_thread[READ_POOL_SIZE+1];
-
-/*
- * RECV_NOTIFY(tid): flag new data for thread @tid and signal its condvar.
- * RECV_AWAIT(tid):  block until thread @tid has been flagged, then consume
- *                   the flag.
- *
- * The wait sits in a predicate loop so that a spurious wake-up from
- * pthread_cond_wait() is not mistaken for a notification, and the flag is
- * cleared on every exit so that one notification satisfies exactly one
- * await.  Both macros are single statements (do/while(0)) and so are safe
- * inside unbraced if/else.
- */
-#define RECV_NOTIFY(tid) do {                                        \
-    pthread_mutex_lock(&(pool_thread[(tid)].ptmutex));               \
-    pool_thread[(tid)].newdata = 1;                                  \
-    DB("CV Waking %u", (tid));                                       \
-    pthread_cond_signal(&(pool_thread[(tid)].ptcv));                 \
-    pthread_mutex_unlock(&(pool_thread[(tid)].ptmutex));             \
-} while (0)
-#define RECV_AWAIT(tid) do {                                         \
-    pthread_mutex_lock(&(pool_thread[(tid)].ptmutex));               \
-    if (pool_thread[(tid)].newdata) {                                \
-        DB("CV Woken %u", (tid));                                    \
-    }                                                                \
-    else {                                                           \
-        DB("CV Waiting %u", (tid));                                  \
-        while (!pool_thread[(tid)].newdata)                          \
-            pthread_cond_wait(&(pool_thread[(tid)].ptcv),            \
-                              &(pool_thread[(tid)].ptmutex));        \
-    }                                                                \
-    pool_thread[(tid)].newdata = 0;                                  \
-    pthread_mutex_unlock(&(pool_thread[(tid)].ptmutex));             \
-} while (0)
-
-/*****************************************************************************
- * Message queue management *
- *****************************************************************************/
-
-/* Protects the queue manipulation critical regions.
- */
-pthread_mutex_t ptmutex_queue;
-#define ENTER_QUEUE_CR pthread_mutex_lock(&ptmutex_queue)
-#define LEAVE_QUEUE_CR pthread_mutex_unlock(&ptmutex_queue)
-
-pthread_mutex_t ptmutex_recv;
-#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv)
-#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv)
-
-/* A message queue entry. We allocate one of these for every request we send.
- * Asynchronous reply reception also uses one of these.
- */
-typedef struct bsq_t_struct {
- struct bsq_t_struct *prev; /* links in the outstanding-request queue */
- struct bsq_t_struct *next;
- int status; /* BSQ_STATUS_* bits; cleared when the request is sent */
- int server; /* index into bsservers[] */
- int length; /* message length; overwritten with the reply's length */
- struct msghdr msghdr; /* scatter/gather header handed to sendmsg/recvmsg */
- struct iovec iov[2]; /* [0] = message header, [1] = block (if any) */
- int tid; /* id of the sending thread, used to notify it of a reply */
- struct timeval tv_sent; /* time of the last (re)transmission */
- bshdr_t message;
- void *block; /* block payload, or NULL */
-} bsq_t;
-
-#define BSQ_STATUS_MATCHED 1 /* set by queuesearch() when a reply matched */
-
-/* Guards luid_cnt. */
-pthread_mutex_t ptmutex_luid;
-#define ENTER_LUID_CR pthread_mutex_lock(&ptmutex_luid)
-#define LEAVE_LUID_CR pthread_mutex_unlock(&ptmutex_luid)
-
-/* Next id to hand out; counting starts at 0x1000. */
-static u64 luid_cnt = 0x1000ULL;
-/* new_luid: return the current counter value and advance it, under the
- * luid mutex.  send_message() stamps every outgoing message with one. */
-u64 new_luid(void) {
- u64 luid;
- ENTER_LUID_CR;
- luid = luid_cnt++;
- LEAVE_LUID_CR;
- return luid;
-}
-
-/* Queue of outstanding requests.
- */
-bsq_t *bs_head = NULL;
-bsq_t *bs_tail = NULL;
-int bs_qlen = 0;
-
-/* queuedebug: dump the outstanding-request queue to stderr, tagged @msg.
- * Acquires the queue lock itself.
- */
-void queuedebug(char *msg) {
-    bsq_t *cur;
-
-    ENTER_QUEUE_CR;
-    fprintf(stderr, "Q: %s len=%u\n", msg, bs_qlen);
-    cur = bs_head;
-    while (cur != NULL) {
-        fprintf(stderr, " luid=%016llx server=%u\n",
-                cur->message.luid, cur->server);
-        cur = cur->next;
-    }
-    LEAVE_QUEUE_CR;
-}
-
-/* enqueue: append @qe to the tail of the outstanding-request queue.
- * Always returns 0. */
-int enqueue(bsq_t *qe) {
-    ENTER_QUEUE_CR;
-    qe->prev = bs_tail;
-    qe->next = NULL;
-    if (bs_head != NULL)
-        bs_tail->next = qe;
-    else
-        bs_head = qe;
-    bs_tail = qe;
-    bs_qlen += 1;
-    LEAVE_QUEUE_CR;
-#ifdef BSDEBUG
-    queuedebug("enqueue");
-#endif
-    return 0;
-}
-
-/* dequeue: unlink @qe from the outstanding-request queue if it is on it.
- *
- * The queue is searched for the pointer itself, so calling this for an
- * entry that was never queued (or was already removed) is harmless.
- * Returns 1 if the entry was found and removed, 0 otherwise.
- */
-int dequeue(bsq_t *qe) {
-    bsq_t *q;
-    int found = 0;
-
-    ENTER_QUEUE_CR;
-    for (q = bs_head; q; q = q->next) {
-        if (q != qe)
-            continue;
-
-        if (q->prev)
-            q->prev->next = q->next;
-        else
-            bs_head = q->next;
-        if (q->next)
-            q->next->prev = q->prev;
-        else
-            bs_tail = q->prev;
-        bs_qlen--;
-        found = 1;
-        break;
-    }
-    LEAVE_QUEUE_CR;
-
-    /* queuedebug() takes the queue lock itself, so log after releasing. */
-#ifdef BSDEBUG
-    queuedebug(found ? "dequeue found" : "dequeue not found");
-#endif
-    return found;
-}
-
-/* queuesearch: match a received message @qe against the outstanding
- * requests (same server, operation and luid).
- *
- * On a match the reply's length, flags and id are copied into the queued
- * request, the request is flagged BSQ_STATUS_MATCHED and unlinked from the
- * queue, and for a read whose queued flags carry no error bit the reply's
- * block buffer is handed over to the request.
- * Returns the matched request, or NULL if nothing matched.
- */
-bsq_t *queuesearch(bsq_t *qe) {
-    bsq_t *hit;
-
-    ENTER_QUEUE_CR;
-    for (hit = bs_head; hit != NULL; hit = hit->next) {
-        if (qe->server != hit->server)
-            continue;
-        if (qe->message.operation != hit->message.operation)
-            continue;
-        if (qe->message.luid != hit->message.luid)
-            continue;
-
-        if ((hit->message.operation == BSOP_READBLOCK) &&
-            ((hit->message.flags & BSOP_FLAG_ERROR) == 0)) {
-            hit->block = qe->block;
-            qe->block = NULL;
-        }
-        hit->length = qe->length;
-        hit->message.flags = qe->message.flags;
-        hit->message.id = qe->message.id;
-        hit->status |= BSQ_STATUS_MATCHED;
-
-        if (hit->prev != NULL)
-            hit->prev->next = hit->next;
-        else
-            bs_head = hit->next;
-        if (hit->next != NULL)
-            hit->next->prev = hit->prev;
-        else
-            bs_tail = hit->prev;
-        hit->next = NULL;
-        hit->prev = NULL;
-        bs_qlen--;
-        break;
-    }
-    LEAVE_QUEUE_CR;
-
-#ifdef BSDEBUG
-    if (hit != NULL)
-        queuedebug("queuesearch found");
-    else
-        queuedebug("queuesearch not found");
-#endif
-    return hit;
-}
-
-/*****************************************************************************
- * Network communication *
- *****************************************************************************/
-
-/* send_message: stamp @qe with a fresh luid, queue it as outstanding and
- * transmit it to bsservers[qe->server].
- *
- * The entry is queued before it is sent.  If sendmsg() fails the entry is
- * taken off the queue again: the callers in this file free the entry on
- * failure, and a queued pointer to freed memory would later be walked by
- * the retry and reply-matching code.
- *
- * Returns the sendmsg() result (bytes sent), or a negative value on error.
- */
-int send_message(bsq_t *qe) {
-    int rc;
-
-    qe->msghdr.msg_name = (void *)&(bsservers[qe->server].sin);
-    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
-    qe->msghdr.msg_iov = qe->iov;
-    if (qe->block)
-        qe->msghdr.msg_iovlen = 2;
-    else
-        qe->msghdr.msg_iovlen = 1;
-    qe->msghdr.msg_control = NULL;
-    qe->msghdr.msg_controllen = 0;
-    qe->msghdr.msg_flags = 0;
-
-    qe->iov[0].iov_base = (void *)&(qe->message);
-    qe->iov[0].iov_len = MSGBUFSIZE_ID;
-
-    if (qe->block) {
-        qe->iov[1].iov_base = qe->block;
-        qe->iov[1].iov_len = BLOCK_SIZE;
-    }
-
-    qe->message.luid = new_luid();
-
-    qe->status = 0;
-    qe->tid = (int)pthread_getspecific(tid_key);
-    if (enqueue(qe) < 0) {
-        fprintf(stderr, "Error enqueuing request.\n");
-        return -1;
-    }
-
-    gettimeofday(&(qe->tv_sent), NULL);
-    DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid);
-    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
-    if (rc < 0)
-        dequeue(qe);
-
-    return rc;
-}
-
-/* recv_message: blocking receive of one message into @qe.
- *
- * The header always goes into qe->message; if qe->block is set a second
- * iovec receives the block payload.  The sender's address is captured in
- * a local and discarded.  Returns the recvmsg() result.
- */
-int recv_message(bsq_t *qe) {
-    struct sockaddr_in from;
-    struct msghdr *mh = &(qe->msghdr);
-
-    mh->msg_name = &from;
-    mh->msg_namelen = sizeof(struct sockaddr_in);
-    mh->msg_iov = qe->iov;
-    mh->msg_iovlen = qe->block ? 2 : 1;
-    mh->msg_control = NULL;
-    mh->msg_controllen = 0;
-    mh->msg_flags = 0;
-
-    qe->iov[0].iov_base = (void *)&(qe->message);
-    qe->iov[0].iov_len = MSGBUFSIZE_ID;
-    if (qe->block) {
-        qe->iov[1].iov_base = qe->block;
-        qe->iov[1].iov_len = BLOCK_SIZE;
-    }
-
-    return recvmsg(bssock, mh, 0);
-}
-
-/* get_server_number: map a socket address to its index in bsservers[].
- *
- * Only slots with a hostname set are considered; family, port and address
- * must all match.  Returns the index, or -1 if no configured server has
- * that address.
- */
-int get_server_number(struct sockaddr_in *sin) {
-    int i;
-
-#ifdef BSDEBUG2
-    fprintf(stderr,
-            "get_server_number(%u.%u.%u.%u/%u)\n",
-            (unsigned int)sin->sin_addr.s_addr & 0xff,
-            ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff,
-            ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff,
-            ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff,
-            (unsigned int)sin->sin_port);
-#endif
-
-    for (i = 0; i < MAX_SERVERS; i++) {
-        if (bsservers[i].hostname) {
-#ifdef BSDEBUG2
-            fprintf(stderr,
-                    "get_server_number check %u.%u.%u.%u/%u\n",
-                    (unsigned int)bsservers[i].sin.sin_addr.s_addr&0xff,
-                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 8)&0xff,
-                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 16)&0xff,
-                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 24)&0xff,
-                    (unsigned int)bsservers[i].sin.sin_port);
-#endif
-            if ((sin->sin_family == bsservers[i].sin.sin_family) &&
-                (sin->sin_port == bsservers[i].sin.sin_port) &&
-                (memcmp((void *)&(sin->sin_addr),
-                        (void *)&(bsservers[i].sin.sin_addr),
-                        sizeof(struct in_addr)) == 0)) {
-                return i;
-            }
-        }
-    }
-
-    return -1;
-}
-
-/* Spare receive buffer, recycled between calls, and the single shared
- * queue entry that recv_any() receives into. */
-void *rx_buffer = NULL;
-bsq_t rx_qe;
-
-/* recv_any: blocking receive of the next message from any server.
- *
- * Receives header and block payload into the shared rx_qe, using the
- * recycled rx_buffer when one is available and allocating a fresh block
- * otherwise.  On a receive error the buffer is put back as the spare so
- * that repeated failures do not allocate (and lose) a block each time.
- *
- * Returns &rx_qe with length and server filled in (server is -1 for an
- * unknown sender), or NULL on allocation or receive failure.
- */
-bsq_t *recv_any(void) {
-    struct sockaddr_in from;
-    int rc;
-
-    DB("ENTER recv_any\n");
-
-    rx_qe.msghdr.msg_name = &from;
-    rx_qe.msghdr.msg_namelen = sizeof(struct sockaddr_in);
-    rx_qe.msghdr.msg_iov = rx_qe.iov;
-    if (!rx_buffer) {
-        rx_buffer = malloc(BLOCK_SIZE);
-        if (!rx_buffer) {
-            perror("recv_any malloc");
-            return NULL;
-        }
-    }
-    rx_qe.block = rx_buffer;
-    rx_buffer = NULL;
-    rx_qe.msghdr.msg_iovlen = 2;
-    rx_qe.msghdr.msg_control = NULL;
-    rx_qe.msghdr.msg_controllen = 0;
-    rx_qe.msghdr.msg_flags = 0;
-
-    rx_qe.iov[0].iov_base = (void *)&(rx_qe.message);
-    rx_qe.iov[0].iov_len = MSGBUFSIZE_ID;
-    rx_qe.iov[1].iov_base = rx_qe.block;
-    rx_qe.iov[1].iov_len = BLOCK_SIZE;
-
-    rc = recvmsg(bssock, &(rx_qe.msghdr), 0);
-    if (rc < 0) {
-        perror("recv_any");
-        rx_buffer = rx_qe.block;
-        rx_qe.block = NULL;
-        return NULL;
-    }
-
-    rx_qe.length = rc;
-    rx_qe.server = get_server_number(&from);
-
-    DB("recv_any from %d luid=%016llx len=%u\n",
-       rx_qe.server, rx_qe.message.luid, rx_qe.length);
-
-    return &rx_qe;
-}
-
-/* recv_recycle_buffer: if @q still owns a block buffer, move it back to
- * rx_buffer so that the next recv_any() can reuse it. */
-void recv_recycle_buffer(bsq_t *q) {
-    if (q->block == NULL)
-        return;
-
-    rx_buffer = q->block;
-    q->block = NULL;
-}
-
-// Block until every request in reqs[0..numreqs) has been flagged
-// BSQ_STATUS_MATCHED, sleeping on this thread's notification slot between
-// checks.  Always returns numreqs.
-int wait_recv(bsq_t **reqs, int numreqs) {
-    int tid = (int)pthread_getspecific(tid_key);
-    unsigned int all, k;
-
-    DB("ENTER wait_recv %u\n", numreqs);
-
-    for (;;) {
-        /* AND the status words together: the bit survives only if set
-         * in every request. */
-        all = 0xffffffff;
-        for (k = 0; k < numreqs; k++)
-            all &= reqs[k]->status;
-
-        if ((all & BSQ_STATUS_MATCHED))
-            break;
-
-        RECV_AWAIT(tid);
-    }
-
-    DB("LEAVE wait_recv\n");
-    return numreqs;
-}
-
-/* retry: retransmit @qe using the message header prepared when it was
- * first sent, refresh its send time and count the retry.
- * Returns 0 on success or the negative sendmsg() result.
- */
-static int retry_count = 0;
-int retry(bsq_t *qe)
-{
-    int sent;
-
-    gettimeofday(&(qe->tv_sent), NULL);
-    DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid);
-    retry_count++;
-
-    sent = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
-    return (sent < 0) ? sent : 0;
-}
-
-/* queue runner
- *
- * Thread body: once a second, walk the outstanding-request queue under the
- * queue lock and retransmit every request whose last send is more than
- * RETRY_TIMEOUT microseconds old.  Never returns.
- *
- * Timestamps are converted to microseconds in long long arithmetic; the
- * seconds field is widened before the multiply so that the product cannot
- * overflow where tv_sec is a 32-bit type.
- */
-void *queue_runner(void *arg)
-{
-    for (;;) {
-        struct timeval now;
-        long long nowus, sus;
-        bsq_t *q;
-        int r;
-
-        sleep(1);
-
-        gettimeofday(&now, NULL);
-        nowus = (long long)now.tv_sec * 1000000LL + now.tv_usec;
-        ENTER_QUEUE_CR;
-        r = retry_count;
-        for (q = bs_head; q; q = q->next) {
-            sus = (long long)q->tv_sent.tv_sec * 1000000LL
-                  + q->tv_sent.tv_usec;
-            if ((nowus - sus) > RETRY_TIMEOUT) {
-                if (retry(q) < 0) {
-                    fprintf(stderr, "Error on sendmsg retry.\n");
-                }
-            }
-        }
-        if (r != retry_count) {
-            fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count);
-        }
-        LEAVE_QUEUE_CR;
-    }
-}
-
-/* receive loop
- *
- * Thread body: receive messages forever, match each against the
- * outstanding requests, recycle the receive buffer and notify the thread
- * that owns a matched request.  Never returns.
- */
-void *receive_loop(void *arg)
-{
-    bsq_t *rx, *match;
-
-    for(;;) {
-        rx = recv_any();
-        if (rx == NULL) {
-            fprintf(stderr, "recv_any error\n");
-            continue;
-        }
-
-        match = queuesearch(rx);
-        recv_recycle_buffer(rx);
-        if (match == NULL) {
-            fprintf(stderr, "Unmatched RX\n");
-            continue;
-        }
-
-        DB("RX MATCH");
-        RECV_NOTIFY(match->tid);
-    }
-}
-pthread_t pthread_recv;
-
-/*****************************************************************************
- * Reading *
- *****************************************************************************/
-
-/* readblock_indiv: read block @id from one server and wait for the reply.
- *
- * The request carries no buffer of its own; on a match the receive path
- * hands its buffer over, and that buffer is what is returned.
- *
- * Returns the block (to be released by the caller), or NULL on allocation
- * failure, send failure, server error or short reply.
- */
-void *readblock_indiv(int server, u64 id) {
-    void *block;
-    bsq_t *qe;
-    int rc;
-
-    qe = (bsq_t *)malloc(sizeof(bsq_t));
-    if (!qe) {
-        perror("readblock qe malloc");
-        return NULL;
-    }
-    qe->block = NULL;
-
-    qe->server = server;
-
-    qe->message.operation = BSOP_READBLOCK;
-    qe->message.flags = 0;
-    qe->message.id = id;
-    qe->length = MSGBUFSIZE_ID;
-
-    if (send_message(qe) < 0) {
-        perror("readblock sendto");
-        goto err;
-    }
-
-    rc = wait_recv(&qe, 1);
-    if (rc < 0) {
-        perror("readblock recv");
-        goto err;
-    }
-
-    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "readblock server error\n");
-        goto err;
-    }
-    if (qe->length < MSGBUFSIZE_BLOCK) {
-        /* report the length actually received */
-        fprintf(stderr, "readblock recv short (%u)\n", qe->length);
-        goto err;
-    }
-    block = qe->block;
-
-    free((void *)qe);
-    return block;
-
- err:
-    /* The entry may still be on the outstanding queue (e.g. the send
-     * failed after it was queued); never free it while it is linked.
-     * dequeue() is a no-op for an entry that is not queued. */
-    dequeue(qe);
-    if (qe->block)
-        free(qe->block);
-    free((void *)qe);
-    return NULL;
-}
-
-/**
- * readblock: read a block from disk
- * @id: block id to read
- *
- * Ids below 6 are always read from the first replica; every other read
- * goes to the next replica in round-robin order.  The round-robin index
- * is shared by all callers, so it is advanced under a mutex and each call
- * works from its own snapshot of it.
- *
- * @return: pointer to block, NULL on error
- */
-void *readblock(u64 id) {
-    static pthread_mutex_t rr_lock = PTHREAD_MUTEX_INITIALIZER;
-    static int rr_next = CLUSTER_MAX_REPLICAS - 1;
-    int map = (int)BSID_MAP(id);
-    int replica;
-    u64 xid;
-    void *block = NULL;
-
-    /* special case for the "superblock" just use the first block on the
-     * first replica. (extend to blocks < 6 for vdi bug)
-     */
-    if (id < 6) {
-        block = readblock_indiv(bsclusters[map].servers[0], id);
-        goto out;
-    }
-
-    pthread_mutex_lock(&rr_lock);
-    rr_next++;
-    if (rr_next >= CLUSTER_MAX_REPLICAS)
-        rr_next = 0;
-    replica = rr_next;
-    pthread_mutex_unlock(&rr_lock);
-
-    switch (replica) {
-    case 0:
-        xid = BSID_REPLICA0(id);
-        break;
-    case 1:
-        xid = BSID_REPLICA1(id);
-        break;
-    case 2:
-        xid = BSID_REPLICA2(id);
-        break;
-    default:
-        /* no accessor for this replica: fail rather than use an
-         * uninitialised id */
-        fprintf(stderr, "readblock: unsupported replica %d\n", replica);
-        goto out;
-    }
-
-    block = readblock_indiv(bsclusters[map].servers[replica], xid);
-
- out:
-#ifdef BSDEBUG
-    if (block)
-        fprintf(stderr, "READ: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
-                id,
-                (unsigned int)((unsigned char *)block)[0],
-                (unsigned int)((unsigned char *)block)[1],
-                (unsigned int)((unsigned char *)block)[2],
-                (unsigned int)((unsigned char *)block)[3],
-                (unsigned int)((unsigned char *)block)[4],
-                (unsigned int)((unsigned char *)block)[5],
-                (unsigned int)((unsigned char *)block)[6],
-                (unsigned int)((unsigned char *)block)[7]);
-    else
-        fprintf(stderr, "READ: %016llx NULL\n", id);
-#endif
-    return block;
-}
-
-/*****************************************************************************
- * Writing *
- *****************************************************************************/
-
-/* writeblock_indiv: send a write of @block as @id to one server.
- *
- * Does not wait for the reply.  On success the returned request is on the
- * outstanding queue and owned by the caller, who waits on it and frees it.
- * @block is referenced, not copied, so it must stay valid until then.
- *
- * Returns the request, or NULL on allocation or send failure.
- */
-bsq_t *writeblock_indiv(int server, u64 id, void *block) {
-
-    bsq_t *qe;
-
-    qe = (bsq_t *)malloc(sizeof(bsq_t));
-    if (!qe) {
-        perror("writeblock qe malloc");
-        return NULL;
-    }
-    qe->server = server;
-
-    qe->message.operation = BSOP_WRITEBLOCK;
-    qe->message.flags = 0;
-    qe->message.id = id;
-    qe->block = block;
-    qe->length = MSGBUFSIZE_BLOCK;
-
-    if (send_message(qe) < 0) {
-        perror("writeblock sendto");
-        /* never free an entry that may still be linked on the queue;
-         * dequeue() is a no-op if it is not */
-        dequeue(qe);
-        free((void *)qe);
-        return NULL;
-    }
-
-    return qe;
-}
-
-
-/**
- * writeblock: write an existing block to disk
- * @id: block id
- * @block: pointer to block
- *
- * Ids below 6 are written to the first replica only; every other block is
- * written to all three replicas and the call succeeds only if all three
- * acknowledge without error.  Both paths free their request entries and
- * return 0 on success.
- *
- * @return: zero on success, -1 on failure
- */
-int writeblock(u64 id, void *block) {
-
-    int map = (int)BSID_MAP(id);
-    int rep0 = bsclusters[map].servers[0];
-    int rep1 = bsclusters[map].servers[1];
-    int rep2 = bsclusters[map].servers[2];
-    bsq_t *reqs[3];
-    int rc;
-
-    reqs[0] = reqs[1] = reqs[2] = NULL;
-
-#ifdef BSDEBUG
-    fprintf(stderr,
-            "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
-            id,
-            (unsigned int)((unsigned char *)block)[0],
-            (unsigned int)((unsigned char *)block)[1],
-            (unsigned int)((unsigned char *)block)[2],
-            (unsigned int)((unsigned char *)block)[3],
-            (unsigned int)((unsigned char *)block)[4],
-            (unsigned int)((unsigned char *)block)[5],
-            (unsigned int)((unsigned char *)block)[6],
-            (unsigned int)((unsigned char *)block)[7]);
-#endif
-
-    /* special case for the "superblock" just use the first block on the
-     * first replica. (extend to blocks < 6 for vdi bug)
-     */
-    if (id < 6) {
-        reqs[0] = writeblock_indiv(rep0, id, block);
-        if (!reqs[0])
-            return -1;
-        rc = wait_recv(reqs, 1);
-        if (rc < 0) {
-            perror("writeblock recv");
-            goto err;
-        }
-        if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
-            fprintf(stderr, "writeblock server0 error\n");
-            goto err;
-        }
-        free((void *)reqs[0]);
-        return 0;
-    }
-
-    reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block);
-    if (!reqs[0])
-        goto err;
-    reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block);
-    if (!reqs[1])
-        goto err;
-    reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block);
-    if (!reqs[2])
-        goto err;
-
-    rc = wait_recv(reqs, 3);
-    if (rc < 0) {
-        perror("writeblock recv");
-        goto err;
-    }
-    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "writeblock server0 error\n");
-        goto err;
-    }
-    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "writeblock server1 error\n");
-        goto err;
-    }
-    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "writeblock server2 error\n");
-        goto err;
-    }
-
-
-    free((void *)reqs[0]);
-    free((void *)reqs[1]);
-    free((void *)reqs[2]);
-    return 0;
-
- err:
-    /* Requests may still be queued; unlink each before freeing it. */
-    if (reqs[0]) {
-        dequeue(reqs[0]);
-        free((void *)reqs[0]);
-    }
-    if (reqs[1]) {
-        dequeue(reqs[1]);
-        free((void *)reqs[1]);
-    }
-    if (reqs[2]) {
-        dequeue(reqs[2]);
-        free((void *)reqs[2]);
-    }
-    return -1;
-}
-
-/*****************************************************************************
- * Allocation *
- *****************************************************************************/
-
-/**
- * allocblock: write a new block to disk
- * @block: pointer to block
- *
- * Shorthand for allocblock_hint() with a hint of 0.
- *
- * @return: new id of block on disk
- */
-u64 allocblock(void *block) {
- return allocblock_hint(block, 0);
-}
-
-/* allocblock_hint_indiv: send an allocate-and-write of @block to one
- * server, passing @hint in the message's id field.
- *
- * Does not wait for the reply.  On success the returned request is on the
- * outstanding queue and owned by the caller, who waits on it and frees it.
- * @block is referenced, not copied, so it must stay valid until then.
- *
- * Returns the request, or NULL on allocation or send failure.
- */
-bsq_t *allocblock_hint_indiv(int server, void *block, u64 hint) {
-    bsq_t *qe;
-
-    qe = (bsq_t *)malloc(sizeof(bsq_t));
-    if (!qe) {
-        perror("allocblock_hint qe malloc");
-        return NULL;
-    }
-    qe->server = server;
-
-    qe->message.operation = BSOP_ALLOCBLOCK;
-    qe->message.flags = 0;
-    qe->message.id = hint;
-    qe->block = block;
-    qe->length = MSGBUFSIZE_BLOCK;
-
-    if (send_message(qe) < 0) {
-        perror("allocblock_hint sendto");
-        /* never free an entry that may still be linked on the queue;
-         * dequeue() is a no-op if it is not */
-        dequeue(qe);
-        free((void *)qe);
-        return NULL;
-    }
-
-    return qe;
-}
-
-/**
- * allocblock_hint: write a new block to disk
- * @block: pointer to block
- * @hint: allocation hint; also used as the index of the cluster to use
- *
- * The block is sent to all three replicas of the cluster and the id each
- * one returns is combined into a single block id.
- *
- * @return: new id of block on disk, 0 on failure
- */
-u64 allocblock_hint(void *block, u64 hint) {
-    int map = (int)hint;
-    int rep[3];
-    bsq_t *reqs[3] = { NULL, NULL, NULL };
-    u64 id0, id1, id2;
-    int k;
-
-    rep[0] = bsclusters[map].servers[0];
-    rep[1] = bsclusters[map].servers[1];
-    rep[2] = bsclusters[map].servers[2];
-
-    DB("ENTER allocblock\n");
-
-    for (k = 0; k < 3; k++) {
-        reqs[k] = allocblock_hint_indiv(rep[k], block, hint);
-        if (reqs[k] == NULL)
-            goto err;
-    }
-
-    if (wait_recv(reqs, 3) < 0) {
-        perror("allocblock recv");
-        goto err;
-    }
-    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "allocblock server0 error\n");
-        goto err;
-    }
-    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "allocblock server1 error\n");
-        goto err;
-    }
-    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "allocblock server2 error\n");
-        goto err;
-    }
-
-    id0 = reqs[0]->message.id;
-    id1 = reqs[1]->message.id;
-    id2 = reqs[2]->message.id;
-
-#ifdef BSDEBUG
-    fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
-            BSID(map, id0, id1, id2),
-            (unsigned int)((unsigned char *)block)[0],
-            (unsigned int)((unsigned char *)block)[1],
-            (unsigned int)((unsigned char *)block)[2],
-            (unsigned int)((unsigned char *)block)[3],
-            (unsigned int)((unsigned char *)block)[4],
-            (unsigned int)((unsigned char *)block)[5],
-            (unsigned int)((unsigned char *)block)[6],
-            (unsigned int)((unsigned char *)block)[7]);
-#endif
-
-    for (k = 0; k < 3; k++)
-        free((void *)reqs[k]);
-    return BSID(map, id0, id1, id2);
-
- err:
-    for (k = 0; k < 3; k++) {
-        if (reqs[k]) {
-            dequeue(reqs[k]);
-            free((void *)reqs[k]);
-        }
-    }
-    return 0;
-}
-
-#else /* /BLOCKSTORE_REMOTE */
-
-/*****************************************************************************
- * Local storage version *
- *****************************************************************************/
-
-/**
- * readblock: read a block from disk
- * @id: block id to read
- *
- * Block ids are 1-based: block @id lives at byte offset
- * (@id - 1) * BLOCK_SIZE of "blockstore.dat".  The file is opened (and
- * created if absent) on every call.  The returned buffer is malloc'd and
- * belongs to the caller.
- *
- * @return: pointer to block, NULL on error
- */
-
-void *readblock(u64 id) {
-    void *block = NULL;
-    int fd;
-
-    fd = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644);
-    if (fd < 0) {
-        perror("open");
-        return NULL;
-    }
-
-    if (lseek64(fd, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
-        printf ("%Ld ", id);
-        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
-        perror("readblock lseek");
-    } else if ((block = malloc(BLOCK_SIZE)) == NULL) {
-        perror("readblock malloc");
-    } else if (read(fd, block, BLOCK_SIZE) != BLOCK_SIZE) {
-        perror("readblock read");
-        free(block);
-        block = NULL;
-    }
-
-    close(fd);
-    return block;
-}
-
-/**
- * writeblock: write an existing block to disk
- * @id: block id
- * @block: pointer to block
- *
- * Block ids are 1-based: the data goes to byte offset
- * (@id - 1) * BLOCK_SIZE of "blockstore.dat".  Anything other than a
- * complete BLOCK_SIZE write is treated as a failure, matching
- * allocblock().
- *
- * @return: zero on success, -1 on failure
- */
-int writeblock(u64 id, void *block) {
-
-    int block_fp;
-
-    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
-
-    if (block_fp < 0) {
-        perror("open");
-        return -1;
-    }
-
-    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
-        perror("writeblock lseek");
-        goto err;
-    }
-    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
-        perror("writeblock write");
-        goto err;
-    }
-    close(block_fp);
-    return 0;
-
-err:
-    close(block_fp);
-    return -1;
-}
-
-/**
- * allocblock: write a new block to disk
- * @block: pointer to block
- *
- * Appends @block to "blockstore.dat" and returns its 1-based id.  Finding
- * the end of the file and writing there are two separate system calls, so
- * the pair is done under a mutex: without it two threads allocating at
- * the same time could both see the same end offset and be given the same
- * id.  (This serialises callers within this process only.)
- *
- * @return: new id of block on disk, 0 on failure
- */
-
-u64 allocblock(void *block) {
-    static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
-    u64 lb = 0;
-    off64_t pos;
-    int block_fp;
-
-    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
-
-    if (block_fp < 0) {
-        perror("open");
-        return 0;
-    }
-
-    pthread_mutex_lock(&alloc_lock);
-    pos = lseek64(block_fp, 0, SEEK_END);
-    if (pos == (off64_t)-1) {
-        perror("allocblock lseek");
-    } else if (pos % BLOCK_SIZE != 0) {
-        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
-    } else if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
-        perror("allocblock write");
-    } else {
-        lb = pos / BLOCK_SIZE + 1;
-    }
-    pthread_mutex_unlock(&alloc_lock);
-
-    close(block_fp);
-    return lb;
-}
-
-/**
- * allocblock_hint: write a new block to disk
- * @block: pointer to block
- * @hint: allocation hint (ignored by this local-storage version)
- *
- * @return: new id of block on disk
- */
-u64 allocblock_hint(void *block, u64 hint) {
- return allocblock(block);
-}
-
-#endif /* BLOCKSTORE_REMOTE */
-
-/*****************************************************************************
- * Memory management *
- *****************************************************************************/
-
-/**
- * newblock: get a new in-memory block set to zeros
- *
- * The block is BLOCK_SIZE bytes, heap-allocated, and is released with
- * freeblock().
- *
- * @return: pointer to new block, NULL on error
- */
-void *newblock() {
-    void *zeroed = calloc(1, BLOCK_SIZE);
-
-    if (zeroed == NULL)
-        perror("newblock");
-
-    return zeroed;
-}
-
-
-/**
- * freeblock: unallocate an in-memory block
- * @block: block to be freed; NULL is accepted and ignored
- */
-void freeblock(void *block) {
-    /* free() already does nothing for a NULL pointer */
-    free(block);
-}
-
-/* new_freeblock: allocate an empty in-memory free-list block.
- * The block carries FREEBLOCK_MAGIC, has no successor and holds no ids.
- * Returns NULL if the block cannot be allocated. */
-static freeblock_t *new_freeblock(void)
-{
-    freeblock_t *fresh = newblock();
-
-    if (fresh != NULL) {
-        fresh->magic = FREEBLOCK_MAGIC;
-        fresh->next  = 0ULL;
-        fresh->count = 0ULL;
-        memset(fresh->list, 0, sizeof fresh->list);
-    }
-
-    return fresh;
-}
-
-/* releaseblock: record block @id on the on-disk free list.
- *
- * The superblock names a "current" free-list block that is being filled
- * and a chain of full ones.  The id is appended to the current block; a
- * current block is created if there is none, and a full one is pushed
- * onto the chain and replaced first.
- *
- * Any failure to read or allocate a block abandons the operation with a
- * message rather than dereferencing a NULL block; in that case @id is not
- * recorded.
- */
-void releaseblock(u64 id)
-{
-    blockstore_super_t *bs_super;
-    freeblock_t *fl_current = NULL;
-    u64 fresh_id;
-
-    /* get superblock */
-    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
-    if (bs_super == NULL) {
-        fprintf(stderr, "releaseblock: cannot read superblock\n");
-        return;
-    }
-
-    /* get freeblock_current */
-    if (bs_super->freelist_current == 0ULL)
-    {
-        fl_current = new_freeblock();
-        if (fl_current == NULL)
-            goto fail;
-        fresh_id = allocblock(fl_current);
-        if (fresh_id == 0ULL)
-            goto fail;
-        bs_super->freelist_current = fresh_id;
-        writeblock(BLOCKSTORE_SUPER, bs_super);
-    } else {
-        fl_current = readblock(bs_super->freelist_current);
-        if (fl_current == NULL)
-            goto fail;
-    }
-
-    /* if full, chain to superblock and allocate new current */
-
-    if (fl_current->count == FREEBLOCK_SIZE) {
-        fl_current->next = bs_super->freelist_full;
-        writeblock(bs_super->freelist_current, fl_current);
-        freeblock(fl_current);
-
-        /* The superblock is only updated (in memory and on disk) once
-         * the replacement block exists, so a failure here leaves the
-         * on-disk free list as it was. */
-        fl_current = new_freeblock();
-        if (fl_current == NULL)
-            goto fail;
-        fresh_id = allocblock(fl_current);
-        if (fresh_id == 0ULL)
-            goto fail;
-        bs_super->freelist_full = bs_super->freelist_current;
-        bs_super->freelist_current = fresh_id;
-        writeblock(BLOCKSTORE_SUPER, bs_super);
-    }
-
-    /* append id to current */
-    fl_current->list[fl_current->count++] = id;
-    writeblock(bs_super->freelist_current, fl_current);
-
-    freeblock(fl_current);
-    freeblock(bs_super);
-    return;
-
-fail:
-    fprintf(stderr, "releaseblock: cannot obtain freelist block\n");
-    freeblock(fl_current);
-    freeblock(bs_super);
-}
-
-/* freelist debug functions: */
-
-/* freelist_count: print how many ids are on the free list.
- * @print_each: if 1, also print every id.
- *
- * Reports the count on the current free-list block, then walks the chain
- * of full blocks and prints the grand total.  Every block read, including
- * the superblock, is released before returning, and an unreadable block
- * ends the walk with a message.
- */
-void freelist_count(int print_each)
-{
-    blockstore_super_t *bs_super;
-    freeblock_t *fb;
-    u64 total = 0, next;
-
-    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
-    if (bs_super == NULL) {
-        fprintf(stderr, "freelist_count: cannot read superblock\n");
-        return;
-    }
-
-    if (bs_super->freelist_current == 0ULL) {
-        printf("freelist is empty!\n");
-        goto out;
-    }
-
-    fb = readblock(bs_super->freelist_current);
-    if (fb == NULL) {
-        fprintf(stderr, "freelist_count: cannot read freelist block\n");
-        goto out;
-    }
-    printf("%Ld entires on current.\n", fb->count);
-    total += fb->count;
-    if (print_each == 1)
-    {
-        int i;
-        for (i=0; i< fb->count; i++)
-            printf(" %Ld\n", fb->list[i]);
-    }
-
-    freeblock(fb);
-
-    if (bs_super->freelist_full == 0ULL) {
-        printf("freelist_full is empty!\n");
-        goto out;
-    }
-
-    next = bs_super->freelist_full;
-    for (;;) {
-        fb = readblock(next);
-        if (fb == NULL) {
-            fprintf(stderr, "freelist_count: cannot read freelist block\n");
-            goto out;
-        }
-        total += fb->count;
-        if (print_each == 1)
-        {
-            int i;
-            for (i=0; i< fb->count; i++)
-                printf(" %Ld\n", fb->list[i]);
-        }
-        next = fb->next;
-        freeblock(fb);
-        if (next == 0ULL) break;
-    }
-    printf("Total of %Ld ids on freelist.\n", total);
-
-out:
-    freeblock(bs_super);
-}
-
-/*****************************************************************************
- * Initialisation *
- *****************************************************************************/
-
-int __init_blockstore(void)
-{
- int i;
- blockstore_super_t *bs_super;
- u64 ret;
- int block_fp;
-
-#ifdef BLOCKSTORE_REMOTE
- struct hostent *addr;
-
- pthread_mutex_init(&ptmutex_queue, NULL);
- pthread_mutex_init(&ptmutex_luid, NULL);
- pthread_mutex_init(&ptmutex_recv, NULL);
- /*pthread_mutex_init(&ptmutex_notify, NULL);*/
- for (i = 0; i <= READ_POOL_SIZE; i++) {
- pool_thread[i].newdata = 0;
- pthread_mutex_init(&(pool_thread[i].ptmutex), NULL);
- pthread_cond_init(&(pool_thread[i].ptcv), NULL);
- }
-
- bsservers[0].hostname = "firebug.cl.cam.ac.uk";
- bsservers[1].hostname = "planb.cl.cam.ac.uk";
- bsservers[2].hostname = "simcity.cl.cam.ac.uk";
- bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/;
- bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/;
- bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/;
- bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/;
- bsservers[7].hostname = NULL/*"felix.cl.cam.ac.uk"*/;
- bsservers[8].hostname = NULL;
- bsservers[9].hostname = NULL;
- bsservers[10].hostname = NULL;
- bsservers[11].hostname = NULL;
- bsservers[12].hostname = NULL;
- bsservers[13].hostname = NULL;
- bsservers[14].hostname = NULL;
- bsservers[15].hostname = NULL;
-
- for (i = 0; i < MAX_SERVERS; i++) {
- if (!bsservers[i].hostname)
- continue;
- addr = gethostbyname(bsservers[i].hostname);
- if (!addr) {
- perror("bad hostname");
- return -1;
- }
- bsservers[i].sin.sin_family = addr->h_addrtype;
- bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT);
- bsservers[i].sin.sin_addr.s_addr =
- ((struct in_addr *)(addr->h_addr))->s_addr;
- }
-
- /* Cluster map
- */
- bsclusters[0].servers[0] = 0;
- bsclusters[0].servers[1] = 1;
- bsclusters[0].servers[2] = 2;
- bsclusters[1].servers[0] = 1;
- bsclusters[1].servers[1] = 2;
- bsclusters[1].servers[2] = 3;
- bsclusters[2].servers[0] = 2;
- bsclusters[2].servers[1] = 3;
- bsclusters[2].servers[2] = 4;
- bsclusters[3].servers[0] = 3;
- bsclusters[3].servers[1] = 4;
- bsclusters[3].servers[2] = 5;
- bsclusters[4].servers[0] = 4;
- bsclusters[4].servers[1] = 5;
- bsclusters[4].servers[2] = 6;
- bsclusters[5].servers[0] = 5;
- bsclusters[5].servers[1] = 6;
- bsclusters[5].servers[2] = 7;
- bsclusters[6].servers[0] = 6;
- bsclusters[6].servers[1] = 7;
- bsclusters[6].servers[2] = 0;
- bsclusters[7].servers[0] = 7;
- bsclusters[7].servers[1] = 0;
- bsclusters[7].servers[2] = 1;
-
- /* Local socket set up
- */
- bssock = socket(AF_INET, SOCK_DGRAM, 0);
- if (bssock < 0) {
- perror("Bad socket");
- return -1;
- }
- memset(&sin_local, 0, sizeof(sin_local));
- sin_local.sin_family = AF_INET;
- sin_local.sin_port = htons(BLOCKSTORED_PORT);
- sin_local.sin_addr.s_addr = htonl(INADDR_ANY);
- if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) {
- perror("bind");
- close(bssock);
- return -1;
- }
-
- pthread_create(&pthread_recv, NULL, receive_loop, NULL);
- pthread_create(&pthread_recv, NULL, queue_runner, NULL);
-
-#else /* /BLOCKSTORE_REMOTE */
- block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
-
- if (block_fp < 0) {
- perror("open");
- return -1;
- exit(-1);
- }
-
- if (lseek(block_fp, 0, SEEK_END) == 0) {
- bs_super = newblock();
- bs_super->magic = BLOCKSTORE_MAGIC;
- bs_super->freelist_full = 0LL;
- bs_super->freelist_current = 0LL;
-
- ret = allocblock(bs_super);
-
- freeblock(bs_super);
- } else {
- bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
- if (bs_super->magic != BLOCKSTORE_MAGIC)
- {
- printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n");
- exit(-1);
- }
- freeblock(bs_super);
- }
-
- close(block_fp);
-
-#endif /* BLOCKSTORE_REMOTE */
- return 0;
-}
-
-void __exit_blockstore(void)
-{
- int i;
-#ifdef BLOCKSTORE_REMOTE
- pthread_mutex_destroy(&ptmutex_recv);
- pthread_mutex_destroy(&ptmutex_luid);
- pthread_mutex_destroy(&ptmutex_queue);
- /*pthread_mutex_destroy(&ptmutex_notify);
- pthread_cond_destroy(&ptcv_notify);*/
- for (i = 0; i <= READ_POOL_SIZE; i++) {
- pthread_mutex_destroy(&(pool_thread[i].ptmutex));
- pthread_cond_destroy(&(pool_thread[i].ptcv));
- }
-#endif
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/blockstore.h
--- a/tools/blktap/blockstore.h Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,134 +0,0 @@
-/**************************************************************************
- *
- * blockstore.h
- *
- * Simple block store interface
- *
- */
-
-#ifndef __BLOCKSTORE_H__
-#define __BLOCKSTORE_H__
-
-#include <netinet/in.h>
-#include <xc.h>
-
-#define BLOCK_SIZE 4096
-#define BLOCK_SHIFT 12
-#define BLOCK_MASK 0xfffffffffffff000LL
-
-/* XXX SMH: where is the below supposed to be defined???? */
-#ifndef SECTOR_SHIFT
-#define SECTOR_SHIFT 9
-#endif
-
-#define FREEBLOCK_SIZE (BLOCK_SIZE / sizeof(u64)) - (3 * sizeof(u64))
-#define FREEBLOCK_MAGIC 0x0fee0fee0fee0feeULL
-
-typedef struct {
- u64 magic;
- u64 next;
- u64 count;
- u64 list[FREEBLOCK_SIZE];
-} freeblock_t;
-
-#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaaULL
-#define BLOCKSTORE_SUPER 1ULL
-
-typedef struct {
- u64 magic;
- u64 freelist_full;
- u64 freelist_current;
-} blockstore_super_t;
-
-extern void *newblock();
-extern void *readblock(u64 id);
-extern u64 allocblock(void *block);
-extern u64 allocblock_hint(void *block, u64 hint);
-extern int writeblock(u64 id, void *block);
-
-/* Add this blockid to a freelist, to be recycled by the allocator. */
-extern void releaseblock(u64 id);
-
-/* this is a memory free() operation for block-sized allocations */
-extern void freeblock(void *block);
-extern int __init_blockstore(void);
-
-/* debug for freelist. */
-void freelist_count(int print_each);
-#define ALLOCFAIL (((u64)(-1)))
-
-/* Distribution
- */
-#define BLOCKSTORED_PORT 9346
-
-struct bshdr_t_struct {
- u32 operation;
- u32 flags;
- u64 id;
- u64 luid;
-} __attribute__ ((packed));
-typedef struct bshdr_t_struct bshdr_t;
-
-struct bsmsg_t_struct {
- bshdr_t hdr;
- unsigned char block[BLOCK_SIZE];
-} __attribute__ ((packed));
-
-typedef struct bsmsg_t_struct bsmsg_t;
-
-#define MSGBUFSIZE_OP sizeof(u32)
-#define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32))
-#define MSGBUFSIZE_ID (sizeof(u32) + sizeof(u32) + sizeof(u64) + sizeof(u64))
-#define MSGBUFSIZE_BLOCK sizeof(bsmsg_t)
-
-#define BSOP_READBLOCK 0x01
-#define BSOP_WRITEBLOCK 0x02
-#define BSOP_ALLOCBLOCK 0x03
-#define BSOP_FREEBLOCK 0x04
-
-#define BSOP_FLAG_ERROR 0x01
-
-#define BS_ALLOC_SKIP 10
-#define BS_ALLOC_HACK
-
-/* Remote hosts and cluster map - XXX need to generalise
- */
-
-/*
-
- Interim ID format is
-
- 63 60 59 40 39 20 19 0
- +----+--------------------+--------------------+--------------------+
- |map | replica 2 | replica 1 | replica 0 |
- +----+--------------------+--------------------+--------------------+
-
- The map is an index into a table detailing which machines form the
- cluster.
-
- */
-
-#define BSID_REPLICA0(_id) ((_id)&0xfffffULL)
-#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL)
-#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL)
-#define BSID_MAP(_id) (((_id)>>60)&0xfULL)
-
-#define BSID(_map, _rep0, _rep1, _rep2) ((((u64)(_map))<<60) | \
- (((u64)(_rep2))<<40) | \
- (((u64)(_rep1))<<20) | ((u64)(_rep0)))
-
-typedef struct bsserver_t_struct {
- char *hostname;
- struct sockaddr_in sin;
-} bsserver_t;
-
-#define MAX_SERVERS 16
-
-#define CLUSTER_MAX_REPLICAS 3
-typedef struct bscluster_t_struct {
- int servers[CLUSTER_MAX_REPLICAS];
-} bscluster_t;
-
-#define MAX_CLUSTERS 16
-
-#endif /* __BLOCKSTORE_H__ */
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/blockstored.c
--- a/tools/blktap/blockstored.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,276 +0,0 @@
-/**************************************************************************
- *
- * blockstored.c
- *
- * Block store daemon.
- *
- */
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <netinet/in.h>
-#include <errno.h>
-#include "blockstore.h"
-
-//#define BSDEBUG
-
-int readblock_into(u64 id, void *block);
-
-int open_socket(u16 port) {
-
- struct sockaddr_in sn;
- int sock;
-
- sock = socket(AF_INET, SOCK_DGRAM, 0);
- if (sock < 0) {
- perror("Bad socket");
- return -1;
- }
- memset(&sn, 0, sizeof(sn));
- sn.sin_family = AF_INET;
- sn.sin_port = htons(port);
- sn.sin_addr.s_addr = htonl(INADDR_ANY);
- if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
- perror("bind");
- close(sock);
- return -1;
- }
-
- return sock;
-}
-
-static int block_fp = -1;
-static int bssock = -1;
-
-int send_reply(struct sockaddr_in *peer, void *buffer, int len) {
-
- int rc;
-
-#ifdef BSDEBUG
- fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n",
- len, ((bsmsg_t *)buffer)->hdr.operation, ((bsmsg_t *)buffer)->hdr.id);
-#endif
- rc = sendto(bssock, buffer, len, 0, (struct sockaddr *)peer, sizeof(*peer));
- if (rc < 0) {
- perror("send_reply");
- return 1;
- }
-
-
- return 0;
-}
-
-static bsmsg_t msgbuf;
-
-void service_loop(void) {
-
- for (;;) {
- int rc, len;
- struct sockaddr_in from;
- size_t slen = sizeof(from);
- u64 bid;
-
- len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0,
- (struct sockaddr *)&from, &slen);
-
- if (len < 0) {
- perror("recvfrom");
- continue;
- }
-
- if (len < MSGBUFSIZE_OP) {
- fprintf(stderr, "Short packet.\n");
- continue;
- }
-
-#ifdef BSDEBUG
- fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n",
- len, msgbuf.hdr.operation, msgbuf.hdr.id);
-#endif
-
- switch (msgbuf.hdr.operation) {
- case BSOP_READBLOCK:
- if (len < MSGBUFSIZE_ID) {
- fprintf(stderr, "Short packet (readblock %u).\n", len);
- continue;
- }
- rc = readblock_into(msgbuf.hdr.id, msgbuf.block);
- if (rc < 0) {
- fprintf(stderr, "readblock error\n");
- msgbuf.hdr.flags = BSOP_FLAG_ERROR;
- send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
- continue;
- }
- msgbuf.hdr.flags = 0;
- send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK);
- break;
- case BSOP_WRITEBLOCK:
- if (len < MSGBUFSIZE_BLOCK) {
- fprintf(stderr, "Short packet (writeblock %u).\n", len);
- continue;
- }
- rc = writeblock(msgbuf.hdr.id, msgbuf.block);
- if (rc < 0) {
- fprintf(stderr, "writeblock error\n");
- msgbuf.hdr.flags = BSOP_FLAG_ERROR;
- send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
- continue;
- }
- msgbuf.hdr.flags = 0;
- send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
- break;
- case BSOP_ALLOCBLOCK:
- if (len < MSGBUFSIZE_BLOCK) {
- fprintf(stderr, "Short packet (allocblock %u).\n", len);
- continue;
- }
- bid = allocblock(msgbuf.block);
- if (bid == ALLOCFAIL) {
- fprintf(stderr, "allocblock error\n");
- msgbuf.hdr.flags = BSOP_FLAG_ERROR;
- send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
- continue;
- }
- msgbuf.hdr.id = bid;
- msgbuf.hdr.flags = 0;
- send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
- break;
- }
-
- }
-}
-
-/**
- * readblock: read a block from disk
- * @id: block id to read
- * @block: pointer to buffer to receive block
- *
- * @return: 0 if OK, other on error
- */
-
-int readblock_into(u64 id, void *block) {
- if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
- printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
- perror("readblock lseek");
- return -1;
- }
- if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
- perror("readblock read");
- return -1;
- }
- return 0;
-}
-
-/**
- * writeblock: write an existing block to disk
- * @id: block id
- * @block: pointer to block
- *
- * @return: zero on success, -1 on failure
- */
-int writeblock(u64 id, void *block) {
- if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
- perror("writeblock lseek");
- return -1;
- }
- if (write(block_fp, block, BLOCK_SIZE) < 0) {
- perror("writeblock write");
- return -1;
- }
- return 0;
-}
-
-/**
- * allocblock: write a new block to disk
- * @block: pointer to block
- *
- * @return: new id of block on disk
- */
-static u64 lastblock = 0;
-
-u64 allocblock(void *block) {
- u64 lb;
- off64_t pos;
-
- retry:
- pos = lseek64(block_fp, 0, SEEK_END);
- if (pos == (off64_t)-1) {
- perror("allocblock lseek");
- return ALLOCFAIL;
- }
- if (pos % BLOCK_SIZE != 0) {
- fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
- return ALLOCFAIL;
- }
- if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
- perror("allocblock write");
- return ALLOCFAIL;
- }
- lb = pos / BLOCK_SIZE + 1;
-
-#ifdef BS_ALLOC_HACK
- if (lb < BS_ALLOC_SKIP)
- goto retry;
-#endif
-
- if (lb <= lastblock)
- printf("[*** %Ld alredy allocated! ***]\n", lb);
-
- lastblock = lb;
- return lb;
-}
-
-/**
- * newblock: get a new in-memory block set to zeros
- *
- * @return: pointer to new block, NULL on error
- */
-void *newblock() {
- void *block = malloc(BLOCK_SIZE);
- if (block == NULL) {
- perror("newblock");
- return NULL;
- }
- memset(block, 0, BLOCK_SIZE);
- return block;
-}
-
-
-/**
- * freeblock: unallocate an in-memory block
- * @id: block id (zero if this is only in-memory)
- * @block: block to be freed
- */
-void freeblock(void *block) {
- if (block != NULL)
- free(block);
-}
-
-
-int main(int argc, char **argv)
-{
- block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
-
- if (block_fp < 0) {
- perror("open");
- return -1;
- }
-
- bssock = open_socket(BLOCKSTORED_PORT);
- if (bssock < 0) {
- return -1;
- }
-
- service_loop();
-
- close(bssock);
-
- return 0;
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/bstest.c
--- a/tools/blktap/bstest.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,191 +0,0 @@
-/**************************************************************************
- *
- * bstest.c
- *
- * Block store daemon test program.
- *
- * usage: bstest <host>|X {r|w|a} ID
- *
- */
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <netinet/in.h>
-#include <netdb.h>
-#include <errno.h>
-#include "blockstore.h"
-
-int direct(char *host, u32 op, u64 id, int len) {
- struct sockaddr_in sn, peer;
- int sock;
- bsmsg_t msgbuf;
- int rc, slen;
- struct hostent *addr;
-
- addr = gethostbyname(host);
- if (!addr) {
- perror("bad hostname");
- exit(1);
- }
- peer.sin_family = addr->h_addrtype;
- peer.sin_port = htons(BLOCKSTORED_PORT);
- peer.sin_addr.s_addr = ((struct in_addr *)(addr->h_addr))->s_addr;
- fprintf(stderr, "Sending to: %u.%u.%u.%u\n",
- (unsigned int)(unsigned char)addr->h_addr[0],
- (unsigned int)(unsigned char)addr->h_addr[1],
- (unsigned int)(unsigned char)addr->h_addr[2],
- (unsigned int)(unsigned char)addr->h_addr[3]);
-
- sock = socket(AF_INET, SOCK_DGRAM, 0);
- if (sock < 0) {
- perror("Bad socket");
- exit(1);
- }
- memset(&sn, 0, sizeof(sn));
- sn.sin_family = AF_INET;
- sn.sin_port = htons(BLOCKSTORED_PORT);
- sn.sin_addr.s_addr = htonl(INADDR_ANY);
- if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
- perror("bind");
- close(sock);
- exit(1);
- }
-
- memset((void *)&msgbuf, 0, sizeof(msgbuf));
- msgbuf.operation = op;
- msgbuf.id = id;
-
- rc = sendto(sock, (void *)&msgbuf, len, 0,
- (struct sockaddr *)&peer, sizeof(peer));
- if (rc < 0) {
- perror("sendto");
- exit(1);
- }
-
- slen = sizeof(peer);
- len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0,
- (struct sockaddr *)&peer, &slen);
- if (len < 0) {
- perror("recvfrom");
- exit(1);
- }
-
- printf("Reply %u bytes:\n", len);
- if (len >= MSGBUFSIZE_OP)
- printf(" operation: %u\n", msgbuf.operation);
- if (len >= MSGBUFSIZE_FLAGS)
- printf(" flags: 0x%x\n", msgbuf.flags);
- if (len >= MSGBUFSIZE_ID)
- printf(" id: %llu\n", msgbuf.id);
- if (len >= (MSGBUFSIZE_ID + 4))
- printf(" data: %02x %02x %02x %02x...\n",
- (unsigned int)msgbuf.block[0],
- (unsigned int)msgbuf.block[1],
- (unsigned int)msgbuf.block[2],
- (unsigned int)msgbuf.block[3]);
-
- if (sock > 0)
- close(sock);
-
- return 0;
-}
-
-int main (int argc, char **argv) {
-
- u32 op = 0;
- u64 id = 0;
- int len = 0, rc;
- void *block;
-
- if (argc < 3) {
- fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n");
- return 1;
- }
-
- switch (argv[2][0]) {
- case 'r':
- case 'R':
- op = BSOP_READBLOCK;
- len = MSGBUFSIZE_ID;
- break;
- case 'w':
- case 'W':
- op = BSOP_WRITEBLOCK;
- len = MSGBUFSIZE_BLOCK;
- break;
- case 'a':
- case 'A':
- op = BSOP_ALLOCBLOCK;
- len = MSGBUFSIZE_BLOCK;
- break;
- default:
- fprintf(stderr, "Unknown action '%s'.\n", argv[2]);
- return 1;
- }
-
- if (argc >= 4)
- id = atoll(argv[3]);
-
- if (strcmp(argv[1], "X") == 0) {
- rc = __init_blockstore();
- if (rc < 0) {
- fprintf(stderr, "blockstore init failed.\n");
- return 1;
- }
- switch(op) {
- case BSOP_READBLOCK:
- block = readblock(id);
- if (block) {
- printf("data: %02x %02x %02x %02x...\n",
- (unsigned int)((unsigned char*)block)[0],
- (unsigned int)((unsigned char*)block)[1],
- (unsigned int)((unsigned char*)block)[2],
- (unsigned int)((unsigned char*)block)[3]);
- }
- break;
- case BSOP_WRITEBLOCK:
- block = malloc(BLOCK_SIZE);
- if (!block) {
- perror("bstest malloc");
- return 1;
- }
- memset(block, 0, BLOCK_SIZE);
- rc = writeblock(id, block);
- if (rc != 0) {
- printf("error\n");
- }
- else {
- printf("OK\n");
- }
- break;
- case BSOP_ALLOCBLOCK:
- block = malloc(BLOCK_SIZE);
- if (!block) {
- perror("bstest malloc");
- return 1;
- }
- memset(block, 0, BLOCK_SIZE);
- id = allocblock_hint(block, id);
- if (id == 0) {
- printf("error\n");
- }
- else {
- printf("ID: %llu\n", id);
- }
- break;
- }
- }
- else {
- direct(argv[1], op, id, len);
- }
-
-
- return 0;
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/parallax.c
--- a/tools/blktap/parallax.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,611 +0,0 @@
-/**************************************************************************
- *
- * parallax.c
- *
- * The Parallax Storage Server
- *
- */
-
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <pthread.h>
-#include "blktaplib.h"
-#include "blockstore.h"
-#include "vdi.h"
-#include "block-async.h"
-#include "requests-async.h"
-
-#define PARALLAX_DEV 61440
-#define SECTS_PER_NODE 8
-
-
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-/* ------[ session records ]----------------------------------------------- */
-
-#define BLKIF_HASHSZ 1024
-#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
-
-#define VDI_HASHSZ 16
-#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1))
-
-typedef struct blkif {
- domid_t domid;
- unsigned int handle;
- enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
- vdi_t *vdi_hash[VDI_HASHSZ];
- struct blkif *hash_next;
-} blkif_t;
-
-static blkif_t *blkif_hash[BLKIF_HASHSZ];
-
-blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
-{
- if ( handle != 0 )
- printf("blktap/parallax don't currently support non-0 dev handles!\n");
-
- blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
- while ( (blkif != NULL) &&
- ((blkif->domid != domid) || (blkif->handle != handle)) )
- blkif = blkif->hash_next;
- return blkif;
-}
-
-vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device)
-{
- vdi_t *vdi = blkif->vdi_hash[VDI_HASH(device)];
-
- while ((vdi != NULL) && (vdi->vdevice != device))
- vdi = vdi->next;
-
- return vdi;
-}
-
-/* ------[ control message handling ]-------------------------------------- */
-
-void blkif_create(blkif_be_create_t *create)
-{
- domid_t domid = create->domid;
- unsigned int handle = create->blkif_handle;
- blkif_t **pblkif, *blkif;
-
- DPRINTF("parallax (blkif_create): create is %p\n", create);
-
- if ( (blkif = (blkif_t *)malloc(sizeof(blkif_t))) == NULL )
- {
- DPRINTF("Could not create blkif: out of memory\n");
- create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
- return;
- }
-
- memset(blkif, 0, sizeof(*blkif));
- blkif->domid = domid;
- blkif->handle = handle;
- blkif->status = DISCONNECTED;
-
- pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
- while ( *pblkif != NULL )
- {
- if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
- {
- DPRINTF("Could not create blkif: already exists (%d,%d)\n",
- domid, handle);
- create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
- free(blkif);
- return;
- }
- pblkif = &(*pblkif)->hash_next;
- }
-
- blkif->hash_next = *pblkif;
- *pblkif = blkif;
-
- DPRINTF("Successfully created blkif\n");
- create->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void blkif_destroy(blkif_be_destroy_t *destroy)
-{
- domid_t domid = destroy->domid;
- unsigned int handle = destroy->blkif_handle;
- blkif_t **pblkif, *blkif;
-
- DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy);
-
- pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
- while ( (blkif = *pblkif) != NULL )
- {
- if ( (blkif->domid == domid) && (blkif->handle == handle) )
- {
- if ( blkif->status != DISCONNECTED )
- goto still_connected;
- goto destroy;
- }
- pblkif = &blkif->hash_next;
- }
-
- destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
- return;
-
- still_connected:
- destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
- return;
-
- destroy:
- *pblkif = blkif->hash_next;
- free(blkif);
- destroy->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void vbd_create(blkif_be_vbd_create_t *create)
-{
- blkif_t *blkif;
- vdi_t *vdi, **vdip;
- blkif_vdev_t vdevice = create->vdevice;
-
- DPRINTF("parallax (vbd_create): create=%p\n", create);
-
- blkif = blkif_find_by_handle(create->domid, create->blkif_handle);
- if ( blkif == NULL )
- {
- DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n",
- create->domid, create->blkif_handle);
- create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
- return;
- }
-
- /* VDI identifier is in grow->extent.sector_start */
- DPRINTF("vbd_create: create->dev_handle (id) is %lx\n",
- (unsigned long)create->dev_handle);
-
- vdi = vdi_get(create->dev_handle);
- if (vdi == NULL)
- {
- printf("parallax (vbd_create): VDI %lx not found.\n",
- (unsigned long)create->dev_handle);
- create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
- return;
- }
-
- vdi->next = NULL;
- vdi->vdevice = vdevice;
- vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
- while (*vdip != NULL)
- vdip = &(*vdip)->next;
- *vdip = vdi;
-
- DPRINTF("blkif_create succeeded\n");
- create->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
-{
- blkif_t *blkif;
- vdi_t *vdi, **vdip;
- blkif_vdev_t vdevice = destroy->vdevice;
-
- blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
- if ( blkif == NULL )
- {
- DPRINTF("vbd_destroy attempted for non-existent blkif (%u,%u)\n",
- destroy->domid, destroy->blkif_handle);
- destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
- return;
- }
-
- vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
- while ((*vdip != NULL) && ((*vdip)->vdevice != vdevice))
- vdip = &(*vdip)->next;
-
- if (*vdip != NULL)
- {
- vdi = *vdip;
- *vdip = vdi->next;
- vdi_put(vdi);
- }
-
-}
-
-int parallax_control(control_msg_t *msg)
-{
- domid_t domid;
- int ret;
-
- DPRINTF("parallax_control: msg is %p\n", msg);
-
- if (msg->type != CMSG_BLKIF_BE)
- {
- printf("Unexpected control message (%d)\n", msg->type);
- return 0;
- }
-
- switch(msg->subtype)
- {
- case CMSG_BLKIF_BE_CREATE:
- if ( msg->length != sizeof(blkif_be_create_t) )
- goto parse_error;
- blkif_create((blkif_be_create_t *)msg->msg);
- break;
-
- case CMSG_BLKIF_BE_DESTROY:
- if ( msg->length != sizeof(blkif_be_destroy_t) )
- goto parse_error;
- blkif_destroy((blkif_be_destroy_t *)msg->msg);
- break;
-
- case CMSG_BLKIF_BE_VBD_CREATE:
- if ( msg->length != sizeof(blkif_be_vbd_create_t) )
- goto parse_error;
- vbd_create((blkif_be_vbd_create_t *)msg->msg);
- break;
-
- case CMSG_BLKIF_BE_VBD_DESTROY:
- if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
- goto parse_error;
- vbd_destroy((blkif_be_vbd_destroy_t *)msg->msg);
- break;
-
- case CMSG_BLKIF_BE_CONNECT:
- case CMSG_BLKIF_BE_DISCONNECT:
- /* we don't manage the device channel, the tap does. */
- break;
-
- default:
- goto parse_error;
- }
- return 0;
-parse_error:
- printf("Bad control message!\n");
- return 0;
-
-}
-
-int parallax_probe(blkif_request_t *req, blkif_t *blkif)
-{
- blkif_response_t *rsp;
- vdisk_t *img_info;
- vdi_t *vdi;
- int i, nr_vdis = 0;
-
- DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif);
-
- /* We expect one buffer only. */
- if ( req->nr_segments != 1 )
- goto err;
-
- /* Make sure the buffer is page-sized. */
- if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
- (blkif_last_sect (req->frame_and_sects[0]) != 7) )
- goto err;
-
- /* fill the list of devices */
- for (i=0; i<VDI_HASHSZ; i++) {
- vdi = blkif->vdi_hash[i];
- while (vdi) {
- img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0);
- img_info[nr_vdis].device = vdi->vdevice;
- img_info[nr_vdis].info = 0;
- /* The -1 here accounts for the LSB in the radix tree */
- img_info[nr_vdis].capacity =
- ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE);
- nr_vdis++;
- vdi = vdi->next;
- }
- }
-
-
- rsp = (blkif_response_t *)req;
- rsp->id = req->id;
- rsp->operation = BLKIF_OP_PROBE;
- rsp->status = nr_vdis; /* number of disks */
-
- DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis);
- return BLKTAP_RESPOND;
-err:
- rsp = (blkif_response_t *)req;
- rsp->id = req->id;
- rsp->operation = BLKIF_OP_PROBE;
- rsp->status = BLKIF_RSP_ERROR;
-
- DPRINTF("parallax_probe: send error response\n");
- return BLKTAP_RESPOND;
-}
-
-typedef struct {
- blkif_request_t *req;
- int count;
- int error;
- pthread_mutex_t mutex;
-} pending_t;
-
-#define MAX_REQUESTS 64
-pending_t pending_list[MAX_REQUESTS];
-
-struct cb_param {
- pending_t *pent;
- int segment;
- u64 sector;
- u64 vblock; /* for debug printing -- can be removed. */
-};
-
-static void read_cb(struct io_ret r, void *in_param)
-{
- struct cb_param *param = (struct cb_param *)in_param;
- pending_t *p = param->pent;
- int segment = param->segment;
- blkif_request_t *req = p->req;
- unsigned long size, offset, start;
- char *dpage, *spage;
-
- spage = IO_BLOCK(r);
- if (spage == NULL) { p->error++; goto finish; }
- dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment);
-
- /* Calculate read size and offset within the read block. */
-
- offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE;
- size = ( blkif_last_sect (req->frame_and_sects[segment]) -
- blkif_first_sect(req->frame_and_sects[segment]) + 1
- ) << SECTOR_SHIFT;
- start = blkif_first_sect(req->frame_and_sects[segment])
- << SECTOR_SHIFT;
-
- DPRINTF("ParallaxRead: sect: %lld (%ld,%ld), "
- "vblock %llx, "
- "size %lx\n",
- param->sector, blkif_first_sect(p->req->frame_and_sects[segment]),
- blkif_last_sect (p->req->frame_and_sects[segment]),
- param->vblock, size);
-
- memcpy(dpage + start, spage + offset, size);
- freeblock(spage);
-
- /* Done the read. Now update the pending record. */
- finish:
- pthread_mutex_lock(&p->mutex);
- p->count--;
-
- if (p->count == 0) {
- blkif_response_t *rsp;
-
- rsp = (blkif_response_t *)req;
- rsp->id = req->id;
- rsp->operation = BLKIF_OP_READ;
- if (p->error == 0) {
- rsp->status = BLKIF_RSP_OKAY;
- } else {
- rsp->status = BLKIF_RSP_ERROR;
- }
- blktap_inject_response(rsp);
- }
-
- pthread_mutex_unlock(&p->mutex);
-
- free(param); /* TODO: replace with cached alloc/dealloc */
-}
-
-int parallax_read(blkif_request_t *req, blkif_t *blkif)
-{
- blkif_response_t *rsp;
- u64 vblock, gblock;
- vdi_t *vdi;
- u64 sector;
- int i;
- char *dpage, *spage;
- pending_t *pent;
-
- vdi = blkif_get_vdi(blkif, req->device);
-
- if ( vdi == NULL )
- goto err;
-
- pent = &pending_list[ID_TO_IDX(req->id)];
- pent->count = req->nr_segments;
- pent->req = req;
- pthread_mutex_init(&pent->mutex, NULL);
-
- for (i = 0; i < req->nr_segments; i++) {
- pthread_t tid;
- int ret;
- struct cb_param *p;
-
- /* Round the requested segment to a block address. */
- sector = req->sector_number + (8*i);
- vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
-
- /* TODO: Replace this call to malloc with a cached allocation */
- p = (struct cb_param *)malloc(sizeof(struct cb_param));
- p->pent = pent;
- p->sector = sector;
- p->segment = i;
- p->vblock = vblock; /* dbg */
-
- /* Get that block from the store. */
- vdi_read(vdi, vblock, read_cb, (void *)p);
- }
-
- return BLKTAP_STOLEN;
-
-err:
- rsp = (blkif_response_t *)req;
- rsp->id = req->id;
- rsp->operation = BLKIF_OP_READ;
- rsp->status = BLKIF_RSP_ERROR;
-
- return BLKTAP_RESPOND;
-}
-
-static void write_cb(struct io_ret r, void *in_param)
-{
- struct cb_param *param = (struct cb_param *)in_param;
- pending_t *p = param->pent;
- blkif_request_t *req = p->req;
-
- /* catch errors from the block code. */
- if (IO_INT(r) < 0) p->error++;
-
- pthread_mutex_lock(&p->mutex);
- p->count--;
-
- if (p->count == 0) {
- blkif_response_t *rsp;
-
- rsp = (blkif_response_t *)req;
- rsp->id = req->id;
- rsp->operation = BLKIF_OP_WRITE;
- if (p->error == 0) {
- rsp->status = BLKIF_RSP_OKAY;
- } else {
- rsp->status = BLKIF_RSP_ERROR;
- }
- blktap_inject_response(rsp);
- }
-
- pthread_mutex_unlock(&p->mutex);
-
- free(param); /* TODO: replace with cached alloc/dealloc */
-}
-
-int parallax_write(blkif_request_t *req, blkif_t *blkif)
-{
- blkif_response_t *rsp;
- u64 sector;
- int i, writable = 0;
- u64 vblock, gblock;
- char *spage;
- unsigned long size, offset, start;
- vdi_t *vdi;
- pending_t *pent;
-
- vdi = blkif_get_vdi(blkif, req->device);
-
- if ( vdi == NULL )
- goto err;
-
- pent = &pending_list[ID_TO_IDX(req->id)];
- pent->count = req->nr_segments;
- pent->req = req;
- pthread_mutex_init(&pent->mutex, NULL);
-
- for (i = 0; i < req->nr_segments; i++) {
- struct cb_param *p;
-
- spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
-
- /* Round the requested segment to a block address. */
-
- sector = req->sector_number + (8*i);
- vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
-
- /* Calculate read size and offset within the read block. */
-
- offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE;
- size = ( blkif_last_sect (req->frame_and_sects[i]) -
- blkif_first_sect(req->frame_and_sects[i]) + 1
- ) << SECTOR_SHIFT;
- start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
-
- DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld), "
- "vblock %llx, gblock %llx, "
- "size %lx\n",
- sector, blkif_first_sect(req->frame_and_sects[i]),
- blkif_last_sect (req->frame_and_sects[i]),
- vblock, gblock, size);
-
- /* XXX: For now we just freak out if they try to write a */
- /* non block-sized, block-aligned page. */
-
- if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) {
- printf("]\n] STRANGE WRITE!\n]\n");
- goto err;
- }
-
- /* TODO: Replace this call to malloc with a cached allocation */
- p = (struct cb_param *)malloc(sizeof(struct cb_param));
- p->pent = pent;
- p->sector = sector;
- p->segment = i;
- p->vblock = vblock; /* dbg */
-
- /* Issue the write to the store. */
- vdi_write(vdi, vblock, spage, write_cb, (void *)p);
- }
-
- return BLKTAP_STOLEN;
-
-err:
- rsp = (blkif_response_t *)req;
- rsp->id = req->id;
- rsp->operation = BLKIF_OP_WRITE;
- rsp->status = BLKIF_RSP_ERROR;
-
- return BLKTAP_RESPOND;
-}
-
-int parallax_request(blkif_request_t *req)
-{
- blkif_response_t *rsp;
- domid_t dom = ID_TO_DOM(req->id);
- blkif_t *blkif = blkif_find_by_handle(dom, 0);
-
- if (blkif == NULL)
- goto err;
-
- if ( req->operation == BLKIF_OP_PROBE ) {
-
- return parallax_probe(req, blkif);
-
- } else if ( req->operation == BLKIF_OP_READ ) {
-
- return parallax_read(req, blkif);
-
- } else if ( req->operation == BLKIF_OP_WRITE ) {
-
- return parallax_write(req, blkif);
-
- } else {
- printf("Unknown request message type!\n");
- /* Unknown operation */
- goto err;
- }
-
-err:
- rsp = (blkif_response_t *)req;
- rsp->operation = req->operation;
- rsp->id = req->id;
- rsp->status = BLKIF_RSP_ERROR;
- return BLKTAP_RESPOND;
-}
-
-void __init_parallax(void)
-{
- memset(blkif_hash, 0, sizeof(blkif_hash));
-}
-
-
-
-int main(int argc, char *argv[])
-{
- DPRINTF("parallax: starting.\n");
- __init_blockstore();
- DPRINTF("parallax: initialized blockstore...\n");
- init_block_async();
- DPRINTF("parallax: initialized async blocks...\n");
- __init_vdi();
- DPRINTF("parallax: initialized vdi registry etc...\n");
- __init_parallax();
- DPRINTF("parallax: initialized local stuff..\n");
-
- blktap_register_ctrl_hook("parallax_control", parallax_control);
- blktap_register_request_hook("parallax_request", parallax_request);
- DPRINTF("parallax: added ctrl + request hooks, starting listen...\n");
- blktap_listen();
-
- return 0;
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/radix.c
--- a/tools/blktap/radix.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,631 +0,0 @@
-/*
- * Radix tree for mapping (up to) 63-bit virtual block IDs to
- * 63-bit global block IDs
- *
- * Pointers within the tree set aside the least significant bit to indicate
- * whether or not the target block is writable from this node.
- *
- * The block with ID 0 is assumed to be an empty block of all zeros
- */
-
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <string.h>
-#include <pthread.h>
-#include "blockstore.h"
-#include "radix.h"
-
-#define RADIX_TREE_MAP_SHIFT 9
-#define RADIX_TREE_MAP_MASK 0x1ff
-#define RADIX_TREE_MAP_ENTRIES 512
-
-/*
-#define DEBUG
-*/
-
-/* Experimental radix cache. */
-
-/* rcache_mutex is taken by rcache_write() and rcache_read() around every */
-/* access to the cache structures below.                                  */
-static pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER;
-/* Number of entries allocated so far; never exceeds RCACHE_MAX. */
-static int rcache_count = 0;
-#define RCACHE_MAX 1024
-
-/* One cached block.  Each entry sits on a hash chain (hash_next) and on  */
-/* a doubly linked recency list (cache_next / cache_prev).                */
-typedef struct rcache_st {
-    radix_tree_node *node;        /* private copy of the block contents */
-    u64 id;                       /* block id this entry caches */
-    struct rcache_st *hash_next;  /* next entry in the same hash bucket */
-    struct rcache_st *cache_next; /* next entry towards rcache_tail */
-    struct rcache_st *cache_prev; /* previous entry towards rcache_head */
-} rcache_t;
-
-/* Recency list: entries are moved to the head when touched and evicted   */
-/* from the tail when the cache is full.                                  */
-static rcache_t *rcache_head = NULL;
-static rcache_t *rcache_tail = NULL;
-
-/* Hash table of bucket heads, indexed by the low bits of the block id.   */
-#define RCHASH_SIZE 512ULL
-rcache_t *rcache[RCHASH_SIZE];
-#define RCACHE_HASH(_id) ((_id) & (RCHASH_SIZE - 1))
-
-/* __rcache_init: empty every bucket of the radix cache hash table. */
-void __rcache_init(void)
-{
-    int bucket = 0;
-
-    while (bucket < RCHASH_SIZE) {
-        rcache[bucket] = NULL;
-        bucket++;
-    }
-}
-
-
-/*
- * rcache_move_to_front: make r the most recently used entry.
- *
- * Caller must hold rcache_mutex and r must already be on the recency list.
- */
-static void rcache_move_to_front(rcache_t *r)
-{
-    if (r == rcache_head)
-        return;
-
-    /* Unlink.  r is not the head, so it has a predecessor. */
-    if (r == rcache_tail) {
-        if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
-        rcache_tail->cache_next = NULL;
-    }
-    if (r->cache_next != NULL) r->cache_next->cache_prev = r->cache_prev;
-    if (r->cache_prev != NULL) r->cache_prev->cache_next = r->cache_next;
-
-    /* Relink at the head. */
-    r->cache_prev = NULL;
-    r->cache_next = rcache_head;
-    if (rcache_head != NULL) rcache_head->cache_prev = r;
-    rcache_head = r;
-}
-
-/*
- * rcache_write: store a copy of a block in the cache.
- * @id:   block id
- * @node: block contents (BLOCK_SIZE bytes are copied)
- *
- * An existing entry for @id is overwritten and moved to the front.
- * Otherwise a new entry is added at the front; once RCACHE_MAX entries
- * exist the tail entry is evicted and its slot reused.  The cache is
- * best-effort: if memory cannot be allocated the block is simply not
- * cached and the cache is left unchanged.
- */
-void rcache_write(u64 id, radix_tree_node *node)
-{
-    rcache_t *r, **curs;
-    radix_tree_node *copy;
-
-    pthread_mutex_lock(&rcache_mutex);
-
-    /* Is it already in the cache? */
-    for (r = rcache[RCACHE_HASH(id)]; r != NULL; r = r->hash_next) {
-        if (r->id == id) {
-            memcpy(r->node, node, BLOCK_SIZE);
-            rcache_move_to_front(r);
-            goto done;
-        }
-    }
-
-    /* Allocate the copy first so that a failure needs no unwinding. */
-    copy = newblock();
-    if (copy == NULL)
-        goto done;
-    memcpy(copy, node, BLOCK_SIZE);
-
-    if ( rcache_count == RCACHE_MAX )
-    {
-        /* Remove the least recently used entry and reuse its slot. */
-        r = rcache_tail;
-        if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
-        rcache_tail->cache_next = NULL;
-        freeblock(r->node);
-
-        curs = &rcache[RCACHE_HASH(r->id)];
-        while ((*curs) != r)
-            curs = &(*curs)->hash_next;
-        *curs = r->hash_next;
-
-    } else {
-
-        r = (rcache_t *)malloc(sizeof(rcache_t));
-        if (r == NULL) {
-            freeblock(copy);
-            goto done;
-        }
-        rcache_count++;
-    }
-
-    r->node = copy;
-    r->id = id;
-
-    /* Insert at the head of the hash chain... */
-    r->hash_next = rcache[RCACHE_HASH(id)];
-    rcache[RCACHE_HASH(id)] = r;
-
-    /* ...and at the head of the recency list. */
-    r->cache_prev = NULL;
-    r->cache_next = rcache_head;
-    if (rcache_head != NULL) rcache_head->cache_prev = r;
-    rcache_head = r;
-    if (rcache_tail == NULL) rcache_tail = r;
-
-done:
-    pthread_mutex_unlock(&rcache_mutex);
-}
-
-/*
- * rcache_read: look a block up in the cache.
- * @id: block id
- *
- * @return: a freshly allocated copy of the cached block (the caller
- *          releases it with freeblock()), or NULL on a miss or when the
- *          copy cannot be allocated.  A hit moves the entry to the front.
- */
-radix_tree_node *rcache_read(u64 id)
-{
-    rcache_t *r;
-    radix_tree_node *node = NULL;
-
-    pthread_mutex_lock(&rcache_mutex);
-
-    for (r = rcache[RCACHE_HASH(id)]; r != NULL; r = r->hash_next)
-        if (r->id == id) break;
-
-    if (r == NULL)
-        goto done; /* miss */
-
-    rcache_move_to_front(r);
-
-    node = newblock();
-    if (node != NULL)
-        memcpy(node, r->node, BLOCK_SIZE);
-
-done:
-    pthread_mutex_unlock(&rcache_mutex);
-
-    return(node);
-}
-
-
-/*
- * rc_readblock: read a block through the radix cache.
- *
- * Returns the cached copy when there is one; otherwise reads the block
- * with readblock() and, if that succeeds, adds it to the cache.
- * Returns NULL when the block cannot be read.
- */
-void *rc_readblock(u64 id)
-{
-    void *blk = (void *)rcache_read(id);
-
-    if (blk == NULL) {
-        blk = readblock(id);
-        if (blk != NULL)
-            rcache_write(id, blk);
-    }
-
-    return(blk);
-}
-
-/*
- * rc_allocblock: allocate a block with allocblock() and, when a non-zero
- * id comes back, cache the contents under that id.  Returns the id.
- */
-u64 rc_allocblock(void *block)
-{
-    u64 id = allocblock(block);
-
-    if (id == ZERO)
-        return(id);
-
-    rcache_write(id, block);
-    return(id);
-}
-
-/*
- * rc_writeblock: write a block to the store and refresh the cache.
- * @id:    block id
- * @block: block contents
- *
- * The cache is updated only when writeblock() does not report failure
- * (callers in this file treat a negative return as failure), so a failed
- * write cannot leave the cache holding data the store never received.
- *
- * @return: the value returned by writeblock()
- */
-int rc_writeblock(u64 id, void *block)
-{
-    int ret;
-
-    ret = writeblock(id, block);
-    if (ret >= 0)
-        rcache_write(id, block);
-
-    return(ret);
-}
-
-
-/*
- * block device interface and other helper functions
- * with these functions, block id is just a 63-bit number, with
- * no special consideration for the LSB
- */
-radix_tree_node cloneblock(radix_tree_node block);
-
-/*
- * main api
- * with these functions, the LSB of root always indicates
- * whether or not the block is writable, including the return
- * values of update and snapshot
- */
-u64 lookup(int height, u64 root, u64 key);
-u64 update(int height, u64 root, u64 key, u64 val);
-u64 snapshot(u64 root);
-
-/**
- * cloneblock: make an in-memory copy of a radix node
- * @block: the node to copy
- *
- * Every entry is copied through ONEMASK, i.e. with its writable bit
- * (the LSB) cleared.
- *
- * @return: the malloc()ed copy, or NULL if allocation fails
- */
-radix_tree_node cloneblock(radix_tree_node block) {
-    radix_tree_node copy;
-    int idx = 0;
-
-    copy = (radix_tree_node) malloc(BLOCK_SIZE);
-    if (copy == NULL) {
-        perror("cloneblock malloc");
-        return NULL;
-    }
-
-    while (idx < RADIX_TREE_MAP_ENTRIES) {
-        copy[idx] = block[idx] & ONEMASK;
-        idx++;
-    }
-
-    return copy;
-}
-
-/**
- * lookup: find the value stored under a key
- * @height: height in bits of the radix tree
- * @root: root node id, with set LSB indicating writable node
- * @key: key to look up
- *
- * The returned LSB is set only if every link followed below @root had its
- * writable bit set.
- *
- * @return: value on success, zero if a link is empty or a node is unreadable
- */
-
-u64 lookup(int height, u64 root, u64 key) {
-    radix_tree_node node;
-    u64 wbit = ONE;
-    int shift;
-
-    assert(key >> height == 0);
-
-    /* the root block may be smaller to ensure all leaves are full */
-    shift = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
-
-    /* walk down one level per iteration, consuming equal sized chunks */
-    for (;; shift -= RADIX_TREE_MAP_SHIFT) {
-
-#ifdef DEBUG
-        printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", shift, root,
-               (int) ((key >> shift) & RADIX_TREE_MAP_MASK),
-               (iswritable(root) ? "" : " (readonly)"));
-#endif
-
-        if (getid(root) == ZERO)
-            return ZERO;
-
-        node = (radix_tree_node) rc_readblock(getid(root));
-        if (node == NULL)
-            return ZERO;
-
-        root = node[(key >> shift) & RADIX_TREE_MAP_MASK];
-        wbit &= root;
-        freeblock(node);
-
-        if (shift == 0)
-            return ( root & ONEMASK ) | wbit;
-    }
-}
-
-/*
- * update: set a radix tree entry, doing copy-on-write as necessary
- * @height: height in bits of the radix tree
- * @root: root node id, with set LSB indicating writable node
- * @key: key to set
- * @val: value to set, s.t. radix(key)=val
- *
- * Recurses one level per call.  A node reached through a read-only link
- * is cloned and saved as a newly allocated block; a writable node is
- * rewritten in place.
- *
- * @returns: (possibly new) root id on success (with LSB=1), 0 on failure
- */
-
-u64 update(int height, u64 root, u64 key, u64 val) {
-    int offset;
-    u64 child;
-    radix_tree_node node;
-
-    /* base case--return val */
-    if (height == 0)
-        return val;
-
-    /* the root block may be smaller to ensure all leaves are full */
-    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
-    offset = (key >> height) & RADIX_TREE_MAP_MASK;
-
-#ifdef DEBUG
-    printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root,
-           offset, (iswritable(root)?"":" (clone)"));
-#endif
-
-    /* load a block, or create a new one */
-    if (root == ZERO) {
-        node = (radix_tree_node) newblock();
-    } else {
-        node = (radix_tree_node) rc_readblock(getid(root));
-
-        /* Only clone a node that was actually read: cloneblock() */
-        /* dereferences its argument.                             */
-        if ((node != NULL) && !iswritable(root)) {
-            /* need to clone this node */
-            radix_tree_node oldnode = node;
-            node = cloneblock(oldnode);
-            freeblock(oldnode);
-            root = ZERO;
-        }
-    }
-
-    if (node == NULL) {
-#ifdef DEBUG
-        printf("update: node is null!\n");
-#endif
-        return ZERO;
-    }
-
-    child = update(height, node[offset], key, val);
-
-    if (child == ZERO) {
-        freeblock(node);
-        return ZERO;
-    } else if (child == node[offset]) {
-        /* no change, so we already owned the child */
-        assert(iswritable(root));
-
-        freeblock(node);
-        return root;
-    }
-
-    node[offset] = child;
-
-    /* new/cloned blocks need to be saved */
-    if (root == ZERO) {
-        /* mark this as an owned block */
-        root = rc_allocblock(node);
-        if (root)
-            root = writable(root);
-    } else if (rc_writeblock(getid(root), node) < 0) {
-        freeblock(node);
-        return ZERO;
-    }
-
-    freeblock(node);
-    return root;
-}
-
-/**
- * snapshot: create a snapshot
- * @root: old root node
- *
- * The old root node is read, cloned (which clears the writable bit of
- * every entry) and the clone is stored as a newly allocated block.
- *
- * @return: new root node marked writable, 0 on error
- */
-u64 snapshot(u64 root) {
-    radix_tree_node orig, copy;
-    u64 newroot;
-
-    orig = rc_readblock(getid(root));
-    if (orig == NULL)
-        return ZERO;
-
-    copy = cloneblock(orig);
-    freeblock(orig);
-    if (copy == NULL)
-        return ZERO;
-
-    newroot = rc_allocblock(copy);
-    freeblock(copy);
-
-    if (newroot != ZERO)
-        return writable(newroot);
-
-    return ZERO;
-}
-
-/**
- * collapse: collapse a parent onto a child.
- * @height: height in bits of the (sub)tree rooted at proot/croot
- * @proot:  parent root, LSB indicating writable
- * @croot:  child root, LSB indicating writable
- *
- * NOTE: This assumes that parent and child really are, and further that
- * there are no other children forked from this parent. (children of the
- * child are okay...)
- *
- * Recurses into every link that differs between the two nodes and is
- * writable in the parent, then releases the parent block when both roots
- * are writable.
- *
- * @return: -1 if either root id is zero or a node cannot be read;
- *          otherwise a running count of released blocks.
- *          NOTE(review): a subtree returning -1 resets the count to -1
- *          but later subtrees still add to it, as in the original code;
- *          confirm whether the error was meant to be sticky.
- */
-
-int collapse(int height, u64 proot, u64 croot)
-{
-    int i, numlinks, total = 0;
-    int ret = 0; /* result of the most recent recursive call, if any */
-    radix_tree_node pnode, cnode;
-
-    if (height == 0) {
-        height = -1; /* terminate recursion */
-    } else {
-        height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
-    }
-    numlinks = (1UL << RADIX_TREE_MAP_SHIFT);
-
-    /* Terminal cases: */
-
-    if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) )
-        return -1;
-
-    /* get roots */
-    if ((pnode = readblock(getid(proot))) == NULL)
-        return -1;
-
-    if ((cnode = readblock(getid(croot))) == NULL)
-    {
-        freeblock(pnode);
-        return -1;
-    }
-
-    /* For each writable link in proot */
-    for (i=0; i<numlinks; i++)
-    {
-        if ( pnode[i] == cnode[i] ) continue;
-
-        /* collapse (next level) */
-        /* if height != 0 and writable... */
-        if (( height >= 0 ) && ( iswritable(pnode[i]) ) )
-        {
-            ret = collapse(height, pnode[i], cnode[i]);
-            if (ret == -1)
-            {
-                total = -1;
-            } else {
-                total += ret;
-            }
-        }
-    }
-
-    /* The in-memory copies are no longer needed. */
-    freeblock(pnode);
-    freeblock(cnode);
-
-    /* if plink is writable, AND clink is writable -> free plink block */
-    if ( ( iswritable(proot) ) && ( iswritable(croot) ) )
-    {
-        releaseblock(getid(proot));
-        if (ret >= 0) total++;
-    }
-
-    return total;
-}
-
-
-/*
- * print_root: dump the tree under a root as a graphviz "dot" graph.
- * @root:   root id (LSB = writable; writable nodes are drawn bold blue)
- * @height: height in bits of the tree under @root
- * @dot_f:  stream to append to, or NULL for a top-level call
- *
- * A top-level call (dot_f == NULL) creates "radix.dot", writes the graph
- * preamble and a node for @root, and on every exit path writes the
- * postamble and closes the file.  Recursive calls receive the open
- * stream and only append nodes and edges to it.
- */
-void print_root(u64 root, int height, FILE *dot_f)
-{
-    FILE *f = dot_f;
-    int i;
-    radix_tree_node node;
-    char *style[2] = { "", "style=bold,color=blue," };
-
-    if (dot_f == NULL) {
-        f = fopen("radix.dot", "w");
-        if (f == NULL) {
-            perror("print_root: open");
-            return;
-        }
-
-        /* write graph preamble */
-        fprintf(f, "digraph G {\n");
-
-        /* add a node for this root. */
-        fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n",
-                getid(root), style[iswritable(root)], getid(root));
-    }
-
-    printf("print_root(%Ld)\n", getid(root));
-
-    /* base case */
-    if (height == 0) {
-        /* add a node and edge for each child root */
-        node = (radix_tree_node) readblock(getid(root));
-        if (node == NULL)
-            goto out;
-
-        for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) {
-            if (node[i] != ZERO) {
-                fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n",
-                        getid(node[i]), style[iswritable(node[i])],
-                        getid(node[i]));
-                fprintf(f, " n%Ld -> n%Ld [label=\"%d\"]\n", getid(root),
-                        getid(node[i]), i);
-            }
-        }
-        freeblock(node);
-        goto out;
-    }
-
-    /* the root block may be smaller to ensure all leaves are full */
-    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
-
-    if (getid(root) == ZERO)
-        goto out;
-
-    node = (radix_tree_node) readblock(getid(root));
-    if (node == NULL)
-        goto out;
-
-    /* add a node and edge for each child root */
-    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
-        if (node[i] != ZERO) {
-            fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n",
-                    getid(node[i]), style[iswritable(node[i])],
-                    getid(node[i]));
-
-            print_root(node[i], height-RADIX_TREE_MAP_SHIFT, f);
-            fprintf(f, " n%Ld -> n%Ld [label=\"%d\"]\n", getid(root),
-                    getid(node[i]), i);
-        }
-
-    freeblock(node);
-
-out:
-    /* write graph postamble; only the top-level call owns the file */
-    if (dot_f == NULL) {
-        fprintf(f, "}\n");
-        fclose(f);
-    }
-}
-
-#ifdef RADIX_STANDALONE
-
-/*
- * Interactive test driver, built only when RADIX_STANDALONE is defined.
- *
- * Initializes the block store, allocates one zero-filled block, then
- * reads commands from stdin until "quit", end of input or a read error.
- * All tree operations use a fixed height of 34 bits.
- */
-int main(int argc, char **argv) {
-    u64 key = ZERO, val = ZERO;
-    u64 root = writable(2ULL);
-    u64 p = ZERO, c = ZERO;
-    int v;
-    char buff[4096];
-
-    __init_blockstore();
-
-    memset(buff, 0, 4096);
-    /*fp = open("radix.dat", O_RDWR | O_CREAT, 0644);
-
-    if (fp < 3) {
-        perror("open");
-        return -1;
-    }
-    if (lseek(fp, 0, SEEK_END) == 0) {
-        write(fp, buff, 4096);
-    }*/
-
-    allocblock(buff);
-
-    printf("Recognized commands:\n"
-           "Note: the LSB of a node number indicates if it is writable\n"
-           " root <node> set root to <node>\n"
-           " snapshot take a snapshot of the root\n"
-           " set <key> <val> set key=val\n"
-           " get <key> query key\n"
-           " c <proot> <croot> collapse\n"
-           " pr print tree to dot\n"
-           " pf <1=verbose> print freelist\n"
-           " quit\n"
-           "\nroot = %Ld\n", root);
-    for (;;) {
-        //print_root(root, 34, NULL);
-        //system("dot radix.dot -Tps -o radix.ps");
-
-        printf("> ");
-        fflush(stdout);
-        /* NULL means end of input or a read error: stop in both cases */
-        /* rather than re-parsing whatever the buffer held before.     */
-        if (fgets(buff, 1024, stdin) == NULL)
-            break;
-        if (sscanf(buff, " root %Ld", &root) == 1) {
-            printf("root set to %Ld\n", root);
-        } else if (sscanf(buff, " set %Ld %Ld", &key, &val) == 2) {
-            root = update(34, root, key, val);
-            printf("root = %Ld\n", root);
-        } else if (sscanf(buff, " c %Ld %Ld", &p, &c) == 2) {
-            v = collapse(34, p, c);
-            printf("reclaimed %d blocks.\n", v);
-        } else if (sscanf(buff, " get %Ld", &key) == 1) {
-            val = lookup(34, root, key);
-            printf("value = %Ld\n", val);
-        } else if (!strcmp(buff, "quit\n")) {
-            break;
-        } else if (!strcmp(buff, "snapshot\n")) {
-            root = snapshot(root);
-            printf("new root = %Ld\n", root);
-        } else if (sscanf(buff, " pr %Ld", &root) == 1) {
-            print_root(root, 34, NULL);
-        } else if (sscanf(buff, " pf %d", &v) == 1) {
-            freelist_count(v);
-        } else if (!strcmp(buff, "pf\n")) {
-            freelist_count(0);
-        } else {
-            printf("command not recognized\n");
-        }
-    }
-    return 0;
-}
-
-#endif
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/radix.h
--- a/tools/blktap/radix.h Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,45 +0,0 @@
-/*
- * Radix tree for mapping (up to) 63-bit virtual block IDs to
- * 63-bit global block IDs
- *
- * Pointers within the tree set aside the least significant bit to indicate
- * whether or not the target block is writable from this node.
- *
- * The block with ID 0 is assumed to be an empty block of all zeros
- */
-
-#ifndef __RADIX_H__
-#define __RADIX_H__
-
-/* I don't really like exposing these, but... */
-/*
- * A tree link is a 64-bit word: the block id lives in the upper 63 bits
- * and the LSB is the "writable" flag.
- *   getid(x)     - block id of link x
- *   putid(x)     - read-only link for block id x
- *   writable(x)  - writable link for block id x
- *   iswritable(x)- non-zero iff link x has the writable flag set
- *   ONEMASK      - all 64 bits except the LSB (strips only the flag)
- */
-#define getid(x) (((x)>>1)&0x7fffffffffffffffLL)
-#define putid(x) ((x)<<1)
-#define writable(x) (((x)<<1)|1LL)
-#define iswritable(x) ((x)&1LL)
-#define ZERO 0LL
-#define ONE 1LL
-#define ONEMASK 0xfffffffffffffffeULL
-
-#define RADIX_TREE_MAP_SHIFT 9
-#define RADIX_TREE_MAP_MASK 0x1ff
-#define RADIX_TREE_MAP_ENTRIES 512
-
-typedef u64 *radix_tree_node;
-
-
-/*
- * main api
- * with these functions, the LSB of root always indicates
- * whether or not the block is writable, including the return
- * values of update and snapshot
- */
-u64 lookup(int height, u64 root, u64 key);
-u64 update(int height, u64 root, u64 key, u64 val);
-u64 snapshot(u64 root);
-int collapse(int height, u64 proot, u64 croot);
-int isprivate(int height, u64 root, u64 key);
-
-
-void __rcache_init(void);
-
-#endif /* __RADIX_H__ */
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/requests-async.c
--- a/tools/blktap/requests-async.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,762 +0,0 @@
-/* requests-async.c
- *
- * asynchronous request dispatcher for radix access in parallax.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <assert.h>
-#include <pthread.h>
-#include <err.h>
-#include <zlib.h> /* for crc32() */
-#include "requests-async.h"
-#include "vdi.h"
-#include "radix.h"
-
-#define L1_IDX(_a) (((_a) & 0x0000000007fc0000ULL) >> 18)
-#define L2_IDX(_a) (((_a) & 0x000000000003fe00ULL) >> 9)
-#define L3_IDX(_a) (((_a) & 0x00000000000001ffULL))
-
-
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-/* Per-block metadata kept in the radix entry that follows a data link. */
-struct block_info {
-    u32 crc;    /* crc32 of the data block, checked by the read path */
-    u32 unused; /* the write path stores a tag value here (dbg) */
-};
-
-/* State of one in-flight asynchronous read or write of a VDI block. */
-struct io_req {
-    enum { IO_OP_READ, IO_OP_WRITE } op;
-    u64 root;                /* radix root of the VDI */
-    u64 vaddr;               /* virtual block address, already shifted left by one */
-    int state;               /* current step, one of enum states */
-    io_cb_t cb;              /* completion callback supplied by the caller */
-    void *param;             /* opaque argument passed to cb */
-    struct radix_lock *lock; /* radix lock of the VDI */
-
-    /* internal stuff: */
-    struct io_ret retval;/* holds the return while we unlock. */
-    char *block; /* the block to write */
-    radix_tree_node radix[3]; /* radix nodes held by a write, indexed L1..L3 */
-    u64 radix_addr[3];        /* block ids of those nodes */
-    struct block_info bi;     /* crc info read from, or to be stored in, the L3 node */
-};
-
-/* clear_w_bits: strip the writable bit from every entry of a radix node. */
-void clear_w_bits(radix_tree_node node)
-{
-    int slot = 0;
-
-    while (slot < RADIX_TREE_MAP_ENTRIES) {
-        node[slot] &= ONEMASK;
-        slot++;
-    }
-}
-
-/*
- * clear_L3_w_bits: strip the writable bit from the even-numbered entries
- * of a bottom-level node.  Odd entries are left untouched; the read and
- * write paths in this file use them for struct block_info.
- */
-void clear_L3_w_bits(radix_tree_node node)
-{
-    int slot = 0;
-
-    while (slot < RADIX_TREE_MAP_ENTRIES) {
-        node[slot] &= ONEMASK;
-        slot += 2;
-    }
-}
-
-/*
- * Values of io_req.state.  A state names the operation most recently
- * issued for the request; read_cb()/write_cb() switch on it to interpret
- * the completion they have just been handed.
- *
- * In the write paths the suffix gives the level at which the walk left
- * the fast path and why: "z" = the link at that level was zero (empty
- * subtree), "f" = the link was not writable (fault).
- */
-enum states {
-    /* both */
-    READ_L1,
-    READ_L2,
-    READ_L3,
-
-    /* read */
-    READ_LOCKED,
-    READ_DATA,
-    READ_UNLOCKED,
-    RETURN_ZERO,
-
-    /* write */
-    WRITE_LOCKED,
-    WRITE_DATA,
-    WRITE_L3,
-    WRITE_UNLOCKED,
-
-    /* L3 Zero Path */
-    ALLOC_DATA_L3z,
-    WRITE_L3_L3z,
-
-    /* L3 Fault Path */
-    ALLOC_DATA_L3f,
-    WRITE_L3_L3f,
-
-    /* L2 Zero Path */
-    ALLOC_DATA_L2z,
-    WRITE_L2_L2z,
-    ALLOC_L3_L2z,
-    WRITE_L2_L3z, /* not handled by write_cb() in this file */
-
-    /* L2 Fault Path */
-    READ_L3_L2f,
-    ALLOC_DATA_L2f,
-    WRITE_L2_L2f,
-    ALLOC_L3_L2f,
-    WRITE_L2_L3f, /* not handled by write_cb() in this file */
-
-    /* L1 Zero Path */
-    ALLOC_DATA_L1z,
-    ALLOC_L3_L1z,
-    ALLOC_L2_L1z,
-    WRITE_L1_L1z,
-
-    /* L1 Fault Path */
-    READ_L2_L1f,
-    READ_L3_L1f,
-    ALLOC_DATA_L1f,
-    ALLOC_L3_L1f,
-    ALLOC_L2_L1f,
-    WRITE_L1_L1f,
-
-};
-
-/* Indices into io_req.radix[] and io_req.radix_addr[], one per tree level. */
-enum radix_offsets {
-    L1 = 0,
-    L2 = 1,
-    L3 = 2
-};
-
-
-static void read_cb(struct io_ret ret, void *param);
-static void write_cb(struct io_ret ret, void *param);
-
-/*
- * vdi_read: start an asynchronous read of one virtual block.
- * @vdi:   VDI to read from
- * @vaddr: virtual block address
- * @cb:    called with the result once the read has finished
- * @param: opaque argument handed to @cb
- *
- * Returns ERR_BAD_VADDR for an address outside VADDR_MASK, ERR_NOMEM if
- * the request cannot be allocated, and 0 once the request has been
- * queued on the VDI's radix lock.
- */
-int vdi_read(vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param)
-{
-    struct io_req *rq;
-
-    if (!VALID_VADDR(vaddr))
-        return ERR_BAD_VADDR;
-
-    rq = (struct io_req *)malloc(sizeof(*rq));
-    if (rq == NULL)
-        return ERR_NOMEM;
-
-    /* Every second entry in the bottom-level radix node is used to    */
-    /* store crc32 values etc., so the address is shifted to land on   */
-    /* the even (data link) entries.                                   */
-    rq->vaddr = vaddr << 1;
-
-    rq->op    = IO_OP_READ;
-    rq->state = READ_LOCKED;
-    rq->root  = vdi->radix_root;
-    rq->lock  = vdi->radix_lock;
-    rq->cb    = cb;
-    rq->param = param;
-    rq->radix[0] = NULL;
-    rq->radix[1] = NULL;
-    rq->radix[2] = NULL;
-
-    block_rlock(rq->lock, L1_IDX(rq->vaddr), read_cb, rq);
-
-    return 0;
-}
-
-
-/*
- * vdi_write: start an asynchronous write of one virtual block.
- * @vdi:   VDI to write to
- * @vaddr: virtual block address
- * @block: BLOCK_SIZE bytes to write
- * @cb:    called with the result once the write has finished
- * @param: opaque argument handed to @cb
- *
- * The crc32 of @block is computed up front and travels with the request.
- * Returns ERR_BAD_VADDR for an address outside VADDR_MASK, ERR_NOMEM if
- * the request cannot be allocated, and 0 once the request has been
- * queued on the VDI's radix lock.
- */
-int vdi_write(vdi_t *vdi, u64 vaddr, char *block,
-              io_cb_t cb, void *param)
-{
-    struct io_req *rq;
-
-    if (!VALID_VADDR(vaddr))
-        return ERR_BAD_VADDR;
-
-    rq = (struct io_req *)malloc(sizeof(*rq));
-    if (rq == NULL)
-        return ERR_NOMEM;
-
-    /* Every second entry in the bottom-level radix node is used to    */
-    /* store crc32 values etc., so the address is shifted to land on   */
-    /* the even (data link) entries.                                   */
-    rq->vaddr = vaddr << 1;
-
-    rq->op    = IO_OP_WRITE;
-    rq->state = WRITE_LOCKED;
-    rq->root  = vdi->radix_root;
-    rq->lock  = vdi->radix_lock;
-    rq->block = block;
-    rq->cb    = cb;
-    rq->param = param;
-    rq->radix[0] = NULL;
-    rq->radix[1] = NULL;
-    rq->radix[2] = NULL;
-    rq->radix_addr[L1] = getid(rq->root); /* for consistency */
-
-    /* Todo: add a pseudoheader to the block to include some location */
-    /* information in the CRC as well. */
-    rq->bi.crc = (u32) crc32(0L, Z_NULL, 0);
-    rq->bi.crc = (u32) crc32(rq->bi.crc, block, BLOCK_SIZE);
-    rq->bi.unused = 0xdeadbeef;
-
-    block_wlock(rq->lock, L1_IDX(rq->vaddr), write_cb, rq);
-
-    return 0;
-}
-
-/*
- * read_cb: completion callback driving an asynchronous VDI read.
- * @ret:   result of the operation that has just completed
- * @param: the struct io_req being processed
- *
- * Sequence: read lock granted -> read L1 -> L2 -> L3 -> data block ->
- * unlock -> user callback.  A zero link at any level short-circuits to
- * RETURN_ZERO, which unlocks and hands the caller a fresh block from
- * newblock().  A failed read releases the read lock before the failing
- * result is delivered to the user callback.
- */
-static void read_cb(struct io_ret ret, void *param)
-{
-    struct io_req *req = (struct io_req *)param;
-    radix_tree_node node;
-    u64 idx;
-    char *block;
-    void *req_param;
-
-    DPRINTF("read_cb\n");
-    /* get record */
-    switch(req->state) {
-
-    case READ_LOCKED:
-
-        DPRINTF("READ_LOCKED\n");
-        req->state = READ_L1;
-        block_read(getid(req->root), read_cb, req);
-        break;
-
-    case READ_L1: /* block is the radix root */
-
-        DPRINTF("READ_L1\n");
-        block = IO_BLOCK(ret);
-        if (block == NULL) goto fail;
-        node = (radix_tree_node) block;
-        idx = getid( node[L1_IDX(req->vaddr)] );
-        free(block);
-        if ( idx == ZERO ) {
-            req->state = RETURN_ZERO;
-            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-        } else {
-            req->state = READ_L2;
-            block_read(idx, read_cb, req);
-        }
-        break;
-
-    case READ_L2:
-
-        DPRINTF("READ_L2\n");
-        block = IO_BLOCK(ret);
-        if (block == NULL) goto fail;
-        node = (radix_tree_node) block;
-        idx = getid( node[L2_IDX(req->vaddr)] );
-        free(block);
-        if ( idx == ZERO ) {
-            req->state = RETURN_ZERO;
-            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-        } else {
-            req->state = READ_L3;
-            block_read(idx, read_cb, req);
-        }
-        break;
-
-    case READ_L3:
-    {
-        struct block_info *bi;
-
-        DPRINTF("READ_L3\n");
-        block = IO_BLOCK(ret);
-        if (block == NULL) goto fail;
-        node = (radix_tree_node) block;
-        idx = getid( node[L3_IDX(req->vaddr)] );
-        /* the entry after the data link holds the block's crc info */
-        bi = (struct block_info *) &node[L3_IDX(req->vaddr) + 1];
-        req->bi = *bi;
-        free(block);
-        if ( idx == ZERO ) {
-            req->state = RETURN_ZERO;
-            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-        } else {
-            req->state = READ_DATA;
-            block_read(idx, read_cb, req);
-        }
-        break;
-    }
-    case READ_DATA:
-    {
-        u32 crc;
-
-        DPRINTF("READ_DATA\n");
-        block = IO_BLOCK(ret);
-        if (block == NULL) goto fail;
-
-        /* crc check */
-        crc = (u32) crc32(0L, Z_NULL, 0);
-        crc = (u32) crc32(crc, block, BLOCK_SIZE);
-        if (crc != req->bi.crc) {
-            /* TODO: add a retry loop here. */
-            /* Do this after the cache is added -- make sure to */
-            /* invalidate the bad page before reissuing the read. */
-
-            warn("Bad CRC on vaddr (%Lu:%d)\n", req->vaddr, req->bi.unused);
-#ifdef PRINT_BADCRC_PAGES
-            {
-                int j;
-                for (j=0; j<BLOCK_SIZE; j++) {
-                    if (isprint((unsigned char)block[j])) {
-                        printf("%c", block[j]);
-                    } else {
-                        printf(".");
-                    }
-                    if ((j % 64) == 0) printf("\n");
-                }
-            }
-#endif /* PRINT_BADCRC_PAGES */
-
-            /* fast and loose for the moment. */
-            /* goto fail; */
-        }
-
-        req->retval = ret;
-        req->state = READ_UNLOCKED;
-        block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-        break;
-    }
-    case READ_UNLOCKED:
-    {
-        struct io_ret r;
-        io_cb_t cb;
-        DPRINTF("READ_UNLOCKED\n");
-        req_param = req->param;
-        r = req->retval;
-        cb = req->cb;
-        free(req);
-        cb(r, req_param);
-        break;
-    }
-
-    case RETURN_ZERO:
-    {
-        struct io_ret r;
-        io_cb_t cb;
-        DPRINTF("RETURN_ZERO\n");
-        req_param = req->param;
-        cb = req->cb;
-        free(req);
-        r.type = IO_BLOCK_T;
-        r.u.b = newblock();
-        cb(r, req_param);
-        break;
-    }
-
-    default:
-        DPRINTF("*** Read: Bad state! (%d) ***\n", req->state);
-        goto bad_state;
-    }
-
-    return;
-
- fail:
-    /* A read failed while the read lock is still held: release the   */
-    /* lock first and deliver the failing result via READ_UNLOCKED.   */
-    DPRINTF("asyn_read had a read error.\n");
-    req->retval = ret;
-    req->state = READ_UNLOCKED;
-    block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-    return;
-
- bad_state:
-    /* Unknown state: lock ownership cannot be determined, so only    */
-    /* complete the request.                                          */
-    {
-        struct io_ret r;
-        io_cb_t cb;
-        req_param = req->param;
-        r = ret;
-        cb = req->cb;
-        free(req);
-        cb(r, req_param);
-    }
-}
-
-/*
- * write_cb: completion callback driving an asynchronous VDI write.
- * @r:     result of the operation that has just completed
- * @param: the struct io_req being processed
- *
- * After the write lock is granted the radix tree is walked L1 -> L2 -> L3.
- * At each level the link is either writable (keep walking), zero (the
- * "z" paths: allocate the missing levels bottom-up) or read-only (the
- * "f" paths: copy the lower levels with their writable bits cleared and
- * allocate new blocks bottom-up).  The last step of every path writes the
- * highest modified node in place, after which the lock is released and
- * the user callback is invoked with the result of that final write.
- *
- * On failure the saved nodes are freed, the write lock is released and
- * the user callback receives an IO_INT_T result of -1.
- */
-static void write_cb(struct io_ret r, void *param)
-{
-    struct io_req *req = (struct io_req *)param;
-    radix_tree_node node;
-    u64 a, addr;
-    void *req_param;
-    struct block_info *bi;
-
-    switch(req->state) {
-
-    case WRITE_LOCKED:
-
-        DPRINTF("WRITE_LOCKED (%llu)\n", L1_IDX(req->vaddr));
-        req->state = READ_L1;
-        block_read(getid(req->root), write_cb, req);
-        break;
-
-    case READ_L1: /* block is the radix root */
-
-        DPRINTF("READ_L1\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        if (node == NULL) goto fail;
-        a = node[L1_IDX(req->vaddr)];
-        addr = getid(a);
-
-        req->radix_addr[L2] = addr;
-        req->radix[L1] = node;
-
-        if ( addr == ZERO ) {
-            /* L1 empty subtree: */
-            req->state = ALLOC_DATA_L1z;
-            block_alloc( req->block, write_cb, req );
-        } else if ( !iswritable(a) ) {
-            /* L1 fault: */
-            req->state = READ_L2_L1f;
-            block_read( addr, write_cb, req );
-        } else {
-            req->state = READ_L2;
-            block_read( addr, write_cb, req );
-        }
-        break;
-
-    case READ_L2:
-
-        DPRINTF("READ_L2\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        if (node == NULL) goto fail;
-        a = node[L2_IDX(req->vaddr)];
-        addr = getid(a);
-
-        req->radix_addr[L3] = addr;
-        req->radix[L2] = node;
-
-        if ( addr == ZERO ) {
-            /* L2 empty subtree: */
-            req->state = ALLOC_DATA_L2z;
-            block_alloc( req->block, write_cb, req );
-        } else if ( !iswritable(a) ) {
-            /* L2 fault: */
-            req->state = READ_L3_L2f;
-            block_read( addr, write_cb, req );
-        } else {
-            req->state = READ_L3;
-            block_read( addr, write_cb, req );
-        }
-        break;
-
-    case READ_L3:
-
-        DPRINTF("READ_L3\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        if (node == NULL) goto fail;
-        a = node[L3_IDX(req->vaddr)];
-        addr = getid(a);
-
-        req->radix[L3] = node;
-
-        if ( addr == ZERO ) {
-            /* L3 zero: no data block yet */
-            req->state = ALLOC_DATA_L3z;
-            block_alloc( req->block, write_cb, req );
-        } else if ( !iswritable(a) ) {
-            /* L3 fault: */
-            req->state = ALLOC_DATA_L3f;
-            block_alloc( req->block, write_cb, req );
-        } else {
-            req->state = WRITE_DATA;
-            block_write( addr, req->block, write_cb, req );
-        }
-        break;
-
-    case WRITE_DATA:
-
-        DPRINTF("WRITE_DATA\n");
-        /* The L3 radix points to the correct block, we just need to */
-        /* update the crc. */
-        if (IO_INT(r) < 0) goto fail;
-        bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 101;
-        *bi = req->bi;
-        req->state = WRITE_L3;
-        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
-        break;
-
-    /* L3 Zero Path: */
-
-    case ALLOC_DATA_L3z:
-
-        DPRINTF("ALLOC_DATA_L3z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L3][L3_IDX(req->vaddr)] = a;
-        bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 102;
-        *bi = req->bi;
-        req->state = WRITE_L3_L3z;
-        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
-        break;
-
-    /* L3 Fault Path: */
-
-    case ALLOC_DATA_L3f:
-
-        DPRINTF("ALLOC_DATA_L3f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L3][L3_IDX(req->vaddr)] = a;
-        bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 103;
-        *bi = req->bi;
-        req->state = WRITE_L3_L3f;
-        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
-        break;
-
-    /* L2 Zero Path: */
-
-    case ALLOC_DATA_L2z:
-
-        DPRINTF("ALLOC_DATA_L2z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L3] = newblock();
-        req->radix[L3][L3_IDX(req->vaddr)] = a;
-        bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 104;
-        *bi = req->bi;
-        req->state = ALLOC_L3_L2z;
-        block_alloc( (char*)req->radix[L3], write_cb, req );
-        break;
-
-    case ALLOC_L3_L2z:
-
-        DPRINTF("ALLOC_L3_L2z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L2][L2_IDX(req->vaddr)] = a;
-        req->state = WRITE_L2_L2z;
-        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
-        break;
-
-    /* L2 Fault Path: */
-
-    case READ_L3_L2f:
-
-        DPRINTF("READ_L3_L2f\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        /* check before touching the node: clear_L3_w_bits() writes to it */
-        if (node == NULL) goto fail;
-        clear_L3_w_bits(node);
-
-        req->radix[L3] = node;
-        req->state = ALLOC_DATA_L2f;
-        block_alloc( req->block, write_cb, req );
-        break;
-
-    case ALLOC_DATA_L2f:
-
-        DPRINTF("ALLOC_DATA_L2f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L3][L3_IDX(req->vaddr)] = a;
-        bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 105;
-        *bi = req->bi;
-        req->state = ALLOC_L3_L2f;
-        block_alloc( (char*)req->radix[L3], write_cb, req );
-        break;
-
-    case ALLOC_L3_L2f:
-
-        DPRINTF("ALLOC_L3_L2f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L2][L2_IDX(req->vaddr)] = a;
-        req->state = WRITE_L2_L2f;
-        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
-        break;
-
-    /* L1 Zero Path: */
-
-    case ALLOC_DATA_L1z:
-
-        DPRINTF("ALLOC_DATA_L1z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L3] = newblock();
-        req->radix[L3][L3_IDX(req->vaddr)] = a;
-        bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 106;
-        *bi = req->bi;
-        req->state = ALLOC_L3_L1z;
-        block_alloc( (char*)req->radix[L3], write_cb, req );
-        break;
-
-    case ALLOC_L3_L1z:
-
-        DPRINTF("ALLOC_L3_L1z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L2] = newblock();
-        req->radix[L2][L2_IDX(req->vaddr)] = a;
-        req->state = ALLOC_L2_L1z;
-        block_alloc( (char*)req->radix[L2], write_cb, req );
-        break;
-
-    case ALLOC_L2_L1z:
-
-        DPRINTF("ALLOC_L2_L1z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L1][L1_IDX(req->vaddr)] = a;
-        req->state = WRITE_L1_L1z;
-        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
-        break;
-
-    /* L1 Fault Path: */
-
-    case READ_L2_L1f:
-
-        DPRINTF("READ_L2_L1f\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        /* check before touching the node: clear_w_bits() writes to it */
-        if (node == NULL) goto fail;
-        clear_w_bits(node);
-        a = node[L2_IDX(req->vaddr)];
-        addr = getid(a);
-
-        req->radix_addr[L3] = addr;
-        req->radix[L2] = node;
-
-        if (addr == ZERO) {
-            /* nothing below L2, create an empty L3 and alloc data. */
-            /* (So skip READ_L3_L1f.) */
-            req->radix[L3] = newblock();
-            req->state = ALLOC_DATA_L1f;
-            block_alloc( req->block, write_cb, req );
-        } else {
-            req->state = READ_L3_L1f;
-            block_read( addr, write_cb, req );
-        }
-        break;
-
-    case READ_L3_L1f:
-
-        DPRINTF("READ_L3_L1f\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        /* check before touching the node: clear_L3_w_bits() writes to it */
-        if (node == NULL) goto fail;
-        clear_L3_w_bits(node);
-
-        req->radix[L3] = node;
-        req->state = ALLOC_DATA_L1f;
-        block_alloc( req->block, write_cb, req );
-        break;
-
-    case ALLOC_DATA_L1f:
-
-        DPRINTF("ALLOC_DATA_L1f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L3][L3_IDX(req->vaddr)] = a;
-        bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 107;
-        *bi = req->bi;
-        req->state = ALLOC_L3_L1f;
-        block_alloc( (char*)req->radix[L3], write_cb, req );
-        break;
-
-    case ALLOC_L3_L1f:
-
-        DPRINTF("ALLOC_L3_L1f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L2][L2_IDX(req->vaddr)] = a;
-        req->state = ALLOC_L2_L1f;
-        block_alloc( (char*)req->radix[L2], write_cb, req );
-        break;
-
-    case ALLOC_L2_L1f:
-
-        DPRINTF("ALLOC_L2_L1f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L1][L1_IDX(req->vaddr)] = a;
-        req->state = WRITE_L1_L1f;
-        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
-        break;
-
-    case WRITE_L3:
-    case WRITE_L3_L3z:
-    case WRITE_L3_L3f:
-    case WRITE_L2_L2z:
-    case WRITE_L2_L2f:
-    case WRITE_L1_L1z:
-    case WRITE_L1_L1f:
-    {
-        int i;
-        DPRINTF("DONE\n");
-        /* free any saved node vals. */
-        for (i=0; i<3; i++)
-            if (req->radix[i] != 0) free(req->radix[i]);
-        req->retval = r;
-        req->state = WRITE_UNLOCKED;
-        block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
-        break;
-    }
-    case WRITE_UNLOCKED:
-    {
-        struct io_ret res;
-        io_cb_t cb;
-        DPRINTF("WRITE_UNLOCKED!\n");
-        req_param = req->param;
-        res = req->retval;
-        cb = req->cb;
-        free(req);
-        cb(res, req_param);
-        break;
-    }
-
-    default:
-        DPRINTF("*** Write: Bad state! (%d) ***\n", req->state);
-        goto bad_state;
-    }
-
-    return;
-
- fail:
-    /* An operation failed while the write lock is still held: drop   */
-    /* the saved nodes, then release the lock and report the error    */
-    /* through the normal WRITE_UNLOCKED step.                        */
-    {
-        int i;
-
-        DPRINTF("asyn_write had a read error mid-way.\n");
-        for (i=0; i<3; i++) {
-            if (req->radix[i] != 0) free(req->radix[i]);
-            req->radix[i] = NULL;
-        }
-        req->retval.type = IO_INT_T;
-        req->retval.u.i = -1;
-        req->state = WRITE_UNLOCKED;
-        block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
-    }
-    return;
-
- bad_state:
-    /* Unknown state: lock ownership cannot be determined, so only    */
-    /* clean up and complete the request with an error.               */
-    {
-        struct io_ret res;
-        io_cb_t cb;
-        int i;
-
-        req_param = req->param;
-        cb = req->cb;
-        res.type = IO_INT_T;
-        res.u.i = -1;
-        /* free any saved node vals. */
-        for (i=0; i<3; i++)
-            if (req->radix[i] != 0) free(req->radix[i]);
-        free(req);
-        cb(res, req_param);
-    }
-}
-
-/*
- * vdi_read_s: synchronous wrapper around vdi_read().
- *
- * A local mutex is locked before the request is issued and locked again
- * afterwards; the second lock returns once the completion callback has
- * unlocked it.  Returns the block delivered to the callback, or NULL if
- * the request could not be started.
- * NOTE(review): this relies on the callback's unlock releasing a mutex
- * this thread locked -- confirm which thread runs the callback.
- */
-char *vdi_read_s(vdi_t *vdi, u64 vaddr)
-{
-    pthread_mutex_t done = PTHREAD_MUTEX_INITIALIZER;
-    char *result = NULL;
-    int rc;
-
-    /* completion callback: record the block and wake the caller */
-    void reads_cb(struct io_ret r, void *param)
-    {
-        result = IO_BLOCK(r);
-        pthread_mutex_unlock((pthread_mutex_t *)param);
-    }
-
-    pthread_mutex_lock(&done);
-    rc = vdi_read(vdi, vaddr, reads_cb, &done);
-
-    /* wait for the callback only if the request was actually started */
-    if (rc != 0)
-        return result;
-
-    pthread_mutex_lock(&done);
-    return result;
-}
-
-
-/*
- * vdi_write_s: synchronous wrapper around vdi_write().
- *
- * A local mutex is locked before the request is issued and locked again
- * afterwards; the second lock returns once the completion callback has
- * unlocked it.
- * NOTE(review): this relies on the callback's unlock releasing a mutex
- * this thread locked -- confirm which thread runs the callback.
- *
- * @return: the integer result delivered to the callback, or the error
- *          code from vdi_write() (ERR_BAD_VADDR / ERR_NOMEM) when the
- *          request could not be started.
- */
-int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block)
-{
-    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
-    int ret, result = -1;
-
-    /* completion callback: record the result and wake the caller */
-    void writes_cb(struct io_ret r, void *param)
-    {
-        result = IO_INT(r);
-        pthread_mutex_unlock((pthread_mutex_t *)param);
-    }
-
-    pthread_mutex_lock(&m);
-    ret = vdi_write(vdi, vaddr, block, writes_cb, &m);
-
-    /* The callback never runs if the request was not started, so     */
-    /* there is no result to wait for: report the start-up error.     */
-    if (ret != 0)
-        return ret;
-
-    pthread_mutex_lock(&m);
-
-    return result;
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/requests-async.h
--- a/tools/blktap/requests-async.h Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,29 +0,0 @@
-#ifndef _REQUESTSASYNC_H_
-#define _REQUESTSASYNC_H_
-
-#include "block-async.h"
-#include "blockstore.h" /* for newblock etc. */
-
-/*
-#define BLOCK_SIZE 4096
-#define ZERO 0ULL
-#define getid(x) (((x)>>1)&0x7fffffffffffffffLLU)
-#define iswritable(x) (((x) & 1LLU) != 0)
-#define writable(x) (((x) << 1) | 1LLU)
-#define readonly(x) ((u64)((x) << 1))
-*/
-
-/* Mask of the virtual block address bits a VDI request may use. */
-#define VADDR_MASK 0x0000000003ffffffLLU /* 26-bits = 256Gig */
-/* True iff x has no bits set outside VADDR_MASK. */
-#define VALID_VADDR(x) (((x) & VADDR_MASK) == (x))
-
-/* Asynchronous block I/O on a VDI: cb is invoked with the result and the
- * caller-supplied param once the request finishes. */
-int vdi_read (vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param);
-int vdi_write(vdi_t *vdi, u64 vaddr, char *block, io_cb_t cb, void *param);
-
-/* synchronous versions: */
-/* These block the calling thread until the request's callback has run. */
-char *vdi_read_s (vdi_t *vdi, u64 vaddr);
-int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block);
-
-/* Error codes -- NOTE(review): presumably returned by vdi_read/vdi_write;
- * confirm against requests-async.c. */
-#define ERR_BAD_VADDR -1
-#define ERR_NOMEM -2
-
-#endif //_REQUESTSASYNC_H_
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/snaplog.c
--- a/tools/blktap/snaplog.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,238 +0,0 @@
-/**************************************************************************
- *
- * snaplog.c
- *
- * Snapshot log on-disk data structure.
- *
- */
-
- /* VDI histories are made from chains of snapshot logs. These logs record
- * the (radix) root and timestamp of individual snapshots.
- *
- * creation of a new VDI involves 'forking' a snapshot log, by creating a
- * new, empty log (in a new VDI) and parenting it off of a record in an
- * existing snapshot log.
- *
- * snapshot log blocks have at most one writer.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "snaplog.h"
-
-
-
-/*
- * snap_get_block: read a block from the block store and check that it
- * carries the snapshot-log magic.  Returns the block on success; returns
- * NULL if the read fails or the magic does not match (the block is freed
- * in the latter case).
- */
-snap_block_t *snap_get_block(u64 block)
-{
-    snap_block_t *sb = (snap_block_t *)readblock(block);
-
-    if ( (sb != NULL) && (sb->hdr.magic != SNAP_MAGIC) ) {
-        /* not a snapshot-log block: discard it */
-        freeblock(sb);
-        sb = NULL;
-    }
-
-    return sb;
-}
-
-/*
- * snap_get_id: copy the snapshot record named by id into *target.
- *
- * Returns 0 on success, -1 if id is NULL, the block cannot be read as a
- * snapshot-log block, or id->index does not name a recorded entry.
- *
- * Valid indices are 0 .. nr_entries-1.  The old test used '>' and so
- * accepted index == nr_entries, returning a record that was never written.
- */
-int snap_get_id(snap_id_t *id, snap_rec_t *target)
-{
-    snap_block_t *blk;
-
-    if ( id == NULL )
-        return -1;
-
-    blk = snap_get_block(id->block);
-
-    if ( blk == NULL )
-        return -1;
-
-    if ( id->index >= blk->hdr.nr_entries ) {
-        freeblock(blk);
-        return -1;
-    }
-
-    *target = blk->snaps[id->index];
-    freeblock(blk);
-    return 0;
-}
-
-/*
- * __snap_block_create: allocate and write a new, empty snapshot-log block.
- *
- *   parent_id  previous block in this chain, or NULL for a root log
- *   fork_id    where this log was forked; must be non-NULL whenever
- *              parent_id is non-NULL
- *   new_id     out: id of the first slot (index 0) in the new block
- *
- * When the parent lives in a different block from the fork point, the new
- * block continues the same log and inherits the parent's log_entries count.
- * The parent and fork records themselves are not validated here.
- *
- * Returns 0 on success, -1 on allocation or read failure.
- */
-int __snap_block_create(snap_id_t *parent_id, snap_id_t *fork_id,
-                        snap_id_t *new_id)
-{
-    snap_block_t *blk, *pblk;
-
-    blk = (snap_block_t *)newblock();
-    if ( blk == NULL )
-        return -1;
-
-    blk->hdr.magic = SNAP_MAGIC;
-    blk->hdr.nr_entries = 0;
-    blk->hdr.log_entries = 0;
-    blk->hdr.immutable = 0;
-
-    if ( (parent_id != NULL)
-         && (parent_id->block != fork_id->block)
-         && (parent_id->block != 0)) {
-
-        pblk = snap_get_block(parent_id->block);
-        if ( pblk == NULL ) {
-            /* parent unreadable: do not dereference it */
-            freeblock(blk);
-            return -1;
-        }
-        blk->hdr.log_entries = pblk->hdr.log_entries;
-        freeblock(pblk);
-    }
-
-    if (parent_id != NULL) {
-        blk->hdr.parent_block = *parent_id;
-        blk->hdr.fork_block = *fork_id;
-    } else {
-        blk->hdr.parent_block = null_snap_id;
-        blk->hdr.fork_block = null_snap_id;
-    }
-
-    new_id->index = 0;
-    new_id->block = allocblock(blk);
-    freeblock(blk);
-    if (new_id->block == 0)
-        return -1;
-
-    return 0;
-}
-
-/* Create the first block of a new snapshot log: parent_id is passed to
- * __snap_block_create() as both the parent and the fork point.  Returns
- * whatever __snap_block_create() returns; new_id receives the new id. */
-int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id)
-{
- return __snap_block_create(parent_id, parent_id, new_id);
-}
-
-/*
- * snap_append: append rec at the slot named by old_id.
- *
- * If the block is full, a continuation block is created, the full block is
- * marked immutable, and the record goes into the new block.  On success
- * *new_id names the next free slot and 0 is returned.  On failure -1 is
- * returned and *new_id is left untouched (old_id and new_id may alias).
- *
- * Fixes over the previous version: an unreadable block is no longer
- * dereferenced, the block is freed on every early return, and *new_id is
- * only written once the append has succeeded.
- */
-int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id)
-{
-    snap_id_t id = *old_id;
-    snap_id_t next;
-    snap_block_t *blk = snap_get_block(id.block);
-
-    if ( blk == NULL )
-        return -1;
-
-    if ( rec->deleted == 1 ) {
-        printf("Attempt to append a deleted snapshot!\n");
-        freeblock(blk);
-        return -1;
-    }
-
-    if ( blk->hdr.immutable != 0 ) {
-        printf("Attempt to snap an immutable snap block!\n");
-        freeblock(blk);
-        return -1;
-    }
-
-    next.block = id.block;
-
-    if (blk->hdr.nr_entries == SNAPS_PER_BLOCK) {
-        int ret;
-
-        id.index--; /* make id point to the last full record */
-
-        ret = __snap_block_create(&id, &blk->hdr.fork_block, &next);
-        if ( ret != 0 ) {
-            freeblock(blk);
-            return -1;
-        }
-
-        /* retire the full block, then continue in the new one */
-        blk->hdr.immutable = 1;
-        writeblock(id.block, blk);
-        freeblock(blk);
-        blk = snap_get_block(next.block);
-        if ( blk == NULL )
-            return -1;
-        id = next;
-    }
-
-    blk->snaps[blk->hdr.nr_entries] = *rec;
-    blk->hdr.nr_entries++;
-    blk->hdr.log_entries++;
-    next.index = blk->hdr.nr_entries;
-    writeblock(id.block, blk);
-    freeblock(blk);
-
-    *new_id = next;
-    return 0;
-}
-
-/*
- * snap_collapse: delete snapshot p_id by collapsing it into its child c_id.
- *
- * Only consecutive snapshots on the same log block are supported for now.
- * The parent record is marked deleted and written back, then the radix
- * trees are merged with collapse().  Returns collapse()'s result (the
- * number of blocks reclaimed, per the original comment) or -1 on failure.
- *
- * Fixes over the previous version: the child block is now NULL-checked
- * (the old code tested p_blk twice), and both indices are checked against
- * the block's recorded entries before they are used.
- */
-int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id)
-{
-    snap_block_t *p_blk, *c_blk;
-    snap_rec_t *p_rec, *c_rec;
-    int ret = -1;
-
-    p_blk = snap_get_block(p_id->block);
-
-    if (p_blk == NULL) return(-1);
-
-    if (c_id->block == p_id->block)
-    {
-        c_blk = p_blk;
-    } else {
-        c_blk = snap_get_block(c_id->block);
-    }
-
-    if (c_blk == NULL) {
-        freeblock(p_blk);
-        return(-1);
-    }
-
-    /* XXX todo: test the range here for delete (and eventually fork) bits */
-    /* for now, snaps must be consecutive, on the same log page: */
-
-    if ((p_id->block != c_id->block) || (p_id->index != c_id->index-1))
-    {
-        printf("Deleting non-consecutive snaps is not done yet.\n");
-        goto done;
-    }
-
-    /* child must be a recorded entry; index 0 has no parent in this block */
-    if ((c_id->index == 0) || (c_id->index >= c_blk->hdr.nr_entries))
-    {
-        printf("Snapshot index is out of range.\n");
-        goto done;
-    }
-
-    p_rec = &p_blk->snaps[p_id->index];
-    c_rec = &c_blk->snaps[c_id->index];
-
-    /* mark parent as deleted XXX: may need to lock parent block here.*/
-    p_rec->deleted = 1;
-    writeblock(p_id->block, p_blk);
-
-    /* delete the parent */
-    printf("collapse(%Ld, %Ld)\n", p_rec->radix_root, c_rec->radix_root);
-    ret = collapse(height, p_rec->radix_root, c_rec->radix_root);
-
-    /* return the number of blocks reclaimed. */
-
-done:
-    if (c_blk != p_blk) freeblock(c_blk);
-    freeblock(p_blk);
-
-    return(ret);
-}
-
-/*
- * snap_print_history: print the snapshot records reachable from snap_id,
- * walking back through parent blocks until a block with no parent (or an
- * unreadable block) is reached.  In each block, records are printed from
- * that block's starting index down to 0.
- *
- * Fixes over the previous version: the next-block pointer is NULL when the
- * chain ends (it used to be left uninitialised), and the index is reset
- * from the parent id for every block instead of carrying over the
- * wrapped-around value from the previous loop.
- */
-void snap_print_history(snap_id_t *snap_id)
-{
-    snap_id_t id = *snap_id;
-    unsigned int idx;
-    snap_block_t *next_blk, *blk = snap_get_block(id.block);
-
-    while ( blk ) {
-        printf("[Snap block %Ld]:\n", id.block);
-        idx = id.index;
-        do {
-            printf(" %03u: root: %Ld ts: %ld.%ld\n", idx,
-                   blk->snaps[idx].radix_root,
-                   blk->snaps[idx].timestamp.tv_sec,
-                   blk->snaps[idx].timestamp.tv_usec);
-        } while (idx-- != 0);
-
-        id = blk->hdr.parent_block;
-        next_blk = (id.block != 0) ? snap_get_block(id.block) : NULL;
-        freeblock(blk);
-        blk = next_blk;
-    }
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/snaplog.h
--- a/tools/blktap/snaplog.h Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,61 +0,0 @@
-/**************************************************************************
- *
- * snaplog.h
- *
- * Snapshot log on-disk data structure.
- *
- */
-
-#include "radix.h"
-#include "blockstore.h" /* for BLOCK_SIZE */
-
-#ifndef __SNAPLOG_H__
-#define __SNAPLOG_H__
-
-/* Identifies one snapshot record: a block in the block store plus the
- * index of the record inside that block's snaps[] array. */
-typedef struct snap_id {
- u64 block;
- unsigned int index;
-} snap_id_t;
-
-/* One snapshot: the radix root captured and the time it was taken. */
-typedef struct snap_rec {
- u64 radix_root;
- struct timeval timestamp;
- /* flags: */
- unsigned deleted:1;
-} snap_rec_t;
-
-
-/* Create a new log block parented (and forked) at parent_id. */
-int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id);
-/* Append rec at id; new_id receives the following slot. */
-int snap_append(snap_id_t *id, snap_rec_t *rec, snap_id_t *new_id);
-/* Collapse parent snapshot p_id into child c_id. */
-int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id);
-/* Print the records reachable from snap_id through parent blocks. */
-void snap_print_history(snap_id_t *snap_id);
-/* Copy the record named by id into *target; 0 on success, -1 on error. */
-int snap_get_id(snap_id_t *id, snap_rec_t *target);
-
-
-/* exported for vdi debugging */
-#define SNAP_MAGIC 0xff00ff0aa0ff00ffLL
-
-/* The "no snapshot" id: block 0, index 0. */
-static const snap_id_t null_snap_id = { 0, 0 };
-
-/* Header at the start of every snapshot-log block. */
-typedef struct snap_block_hdr {
- u64 magic;
- snap_id_t parent_block; /* parent block within this chain */
- snap_id_t fork_block; /* where this log was forked */
- unsigned log_entries; /* total entries since forking */
- unsigned short nr_entries; /* entries in snaps[] */
- unsigned short immutable; /* has this snap page become immutable? */
-} snap_block_hdr_t;
-
-
-/* Number of records that fit in one block after the header. */
-#define SNAPS_PER_BLOCK \
- ((BLOCK_SIZE - sizeof(snap_block_hdr_t)) / sizeof(snap_rec_t))
-
-/* A whole snapshot-log block: header followed by the record array. */
-typedef struct snap_block {
- snap_block_hdr_t hdr;
- snap_rec_t snaps[SNAPS_PER_BLOCK];
-} snap_block_t;
-
-
-/* Read a block and verify SNAP_MAGIC; NULL on failure. */
-snap_block_t *snap_get_block(u64 block);
-
-#endif /* __SNAPLOG_H__ */
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/vdi.c
--- a/tools/blktap/vdi.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,367 +0,0 @@
-/**************************************************************************
- *
- * vdi.c
- *
- * Virtual Disk Image (VDI) Interfaces
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/time.h>
-#include <pthread.h>
-#include "blockstore.h"
-#include "block-async.h"
-#include "requests-async.h"
-#include "radix.h"
-#include "vdi.h"
-
-#define VDI_REG_BLOCK 2LL
-#define VDI_RADIX_ROOT writable(3)
-
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-/* I haven't decided about this registry stuff, so this is just a really
- * quick lash-up so that there is some way to track VDIs.
- *
- * (Most vdi access should be with a direct handle to the block, so this
- * registry is just for start-of-day lookup and other control operations.)
- */
-
-/*
- * create_vdi_registry: initialise the on-disk VDI registry.
- *
- * A fresh block is first written, unmodified, as the VDI radix root; the
- * same buffer is then stamped with the registry magic and a zero VDI count
- * and written to the registry block.  Returns the in-memory registry, or
- * NULL if no block could be allocated.
- */
-vdi_registry_t *create_vdi_registry(void)
-{
-    vdi_registry_t *registry = (vdi_registry_t *)newblock();
-
-    if (registry != NULL) {
-        /* zero-fill the vdi radix root while we have an empty block. */
-        writeblock(VDI_RADIX_ROOT, (void *)registry);
-
-        DPRINTF("[vdi.c] Creating VDI registry!\n");
-        registry->magic = VDI_REG_MAGIC;
-        registry->nr_vdis = 0;
-
-        writeblock(VDI_REG_BLOCK, (void *)registry);
-    }
-
-    return registry;
-}
-
-/*
- * get_vdi_registry: read the VDI registry block, creating the registry if
- * the block cannot be read.  Returns NULL if the registry can be neither
- * read nor created, or if its magic is wrong.  The caller must freeblock()
- * the result.
- */
-vdi_registry_t *get_vdi_registry(void)
-{
-    vdi_registry_t *vdi_reg = (vdi_registry_t *)readblock(VDI_REG_BLOCK);
-
-    if ( vdi_reg == NULL )
-        vdi_reg = create_vdi_registry();
-
-    /* creation can fail too; the old code dereferenced NULL here */
-    if ( vdi_reg == NULL )
-        return NULL;
-
-    if ( vdi_reg->magic != VDI_REG_MAGIC ) {
-        freeblock(vdi_reg);
-        return NULL;
-    }
-
-    return vdi_reg;
-}
-
-
-/*
- * vdi_create: create a new VDI and register it.
- *
- *   parent_snap  snapshot to fork from, or NULL for an empty VDI
- *   name         human-readable name; truncated to VDI_NAME_SZ-1 characters
- *
- * If parent_snap names a readable record, the new VDI's radix root is a
- * snapshot of that record's root; otherwise a fresh writable root is
- * allocated.  A snapshot log is created, the vdi is written to a newly
- * allocated block and entered in the registry, and an in-memory radix_lock
- * is attached.  Returns the vdi (release it with vdi_put()) or NULL.
- *
- * Fix: the name terminator used to be stored at name[VDI_NAME_SZ], one
- * byte past the end of the array.
- */
-vdi_t *vdi_create(snap_id_t *parent_snap, char *name)
-{
-    int ret;
-    vdi_t *vdi;
-    vdi_registry_t *vdi_reg;
-    snap_rec_t snap_rec;
-
-    /* create a vdi struct */
-    vdi = newblock();
-    if (vdi == NULL)
-        return NULL;
-
-    if ( snap_get_id(parent_snap, &snap_rec) == 0 ) {
-        vdi->radix_root = snapshot(snap_rec.radix_root);
-    } else {
-        vdi->radix_root = allocblock((void *)vdi); /* vdi is just zeros here */
-        vdi->radix_root = writable(vdi->radix_root); /* grr. */
-    }
-
-    /* create a snapshot log, and add it to the vdi struct */
-
-    ret = snap_block_create(parent_snap, &vdi->snap);
-    if ( ret != 0 ) {
-        DPRINTF("Error getting snap block in vdi_create.\n");
-        freeblock(vdi);
-        return NULL;
-    }
-
-    /* append the vdi to the registry, fill block and id. */
-    /* implicit allocation means we have to write the vdi twice here. */
-    vdi_reg = get_vdi_registry();
-    if ( vdi_reg == NULL ) {
-        freeblock(vdi);
-        return NULL;
-    }
-
-    vdi->block = allocblock((void *)vdi);
-    vdi->id = vdi_reg->nr_vdis++;
-    strncpy(vdi->name, name, VDI_NAME_SZ - 1);
-    vdi->name[VDI_NAME_SZ - 1] = '\0';
-    vdi->radix_lock = NULL; /* for tidiness */
-    writeblock(vdi->block, (void *)vdi);
-
-    update(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi->id, vdi->block);
-    writeblock(VDI_REG_BLOCK, (void *)vdi_reg);
-    freeblock(vdi_reg);
-
-    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
-    if (vdi->radix_lock == NULL)
-    {
-        perror("couldn't malloc radix_lock for new vdi!");
-        freeblock(vdi);
-        return NULL;
-    }
-    radix_lock_init(vdi->radix_lock);
-
-    return vdi;
-}
-
-/* vdi_get and vdi_put currently act more like alloc/free -- they don't
- * do refcount-based allocation.
- */
-/*
- * vdi_get: load the VDI with the given id and attach a fresh radix_lock.
- * Returns NULL if the id is not in the registry, the block cannot be read,
- * or the lock cannot be allocated.  Release the result with vdi_put().
- */
-vdi_t *vdi_get(u64 vdi_id)
-{
-    u64 vdi_blk;
-    vdi_t *vdi;
-
-    vdi_blk = lookup(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi_id);
-
-    if ( vdi_blk == 0 )
-        return NULL;
-
-    vdi = (vdi_t *)readblock(vdi_blk);
-
-    /* the read can fail; the old code dereferenced NULL here */
-    if ( vdi == NULL )
-        return NULL;
-
-    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
-    if (vdi->radix_lock == NULL)
-    {
-        perror("couldn't malloc radix_lock for new vdi!");
-        freeblock(vdi);
-        return NULL;
-    }
-    radix_lock_init(vdi->radix_lock);
-
-    return vdi;
-}
-
-/* Release a vdi: frees its radix_lock, then the vdi block itself.
- * vdi must be non-NULL (it is dereferenced unconditionally). */
-void vdi_put(vdi_t *vdi)
-{
- free(vdi->radix_lock);
- freeblock(vdi);
-}
-
-/*
- * vdi_snapshot: take a snapshot of vdi.
- *
- * The current radix root is recorded with a timestamp, the vdi switches to
- * a new root obtained from snapshot(), and the record is appended to the
- * vdi's snapshot log.  The vdi block is written back only if the append
- * succeeds.
- */
-void vdi_snapshot(vdi_t *vdi)
-{
-    snap_rec_t entry;
-
-    /* capture the outgoing root before it is replaced */
-    entry.radix_root = vdi->radix_root;
-    gettimeofday(&entry.timestamp, NULL);
-    entry.deleted = 0;
-
-    vdi->radix_root = snapshot(vdi->radix_root);
-
-    if ( snap_append(&vdi->snap, &entry, &vdi->snap) != 0 ) {
-        printf("snap_append returned failure\n");
-        return;
-    }
-
-    writeblock(vdi->block, vdi);
-}
-
-/*
- * __init_vdi: start-of-day setup for the VDI layer.  Initialises the radix
- * cache and makes sure a VDI registry exists.  Returns 0 on success, -1 if
- * the registry can be neither read nor created.
- */
-int __init_vdi()
-{
-    vdi_registry_t *registry;
-
-    /* sneak this in here for the moment. */
-    __rcache_init();
-
-    /* force the registry to be created if it doesn't exist. */
-    registry = get_vdi_registry();
-    if (registry != NULL) {
-        freeblock(registry);
-        return 0;
-    }
-
-    printf("[vdi.c] Couldn't get/create a VDI registry!\n");
-    return -1;
-}
-
-#ifdef VDI_STANDALONE
-
-#define TEST_VDIS 50
-#define NR_ITERS 50000
-#define FORK_POINTS 200
-#define INIT_VDIS 3
-#define INIT_SNAPS 40
-
-/* These must be of decreasing size: */
-#define NEW_FORK (RAND_MAX-(RAND_MAX/1000))
-#define NEW_ROOT_VDI (RAND_MAX-((RAND_MAX/1000)*2))
-#define NEW_FORK_VDI (RAND_MAX-((RAND_MAX/1000)*3))
-
-#define GRAPH_DOT_FILE "vdi.dot"
-#define GRAPH_PS_FILE "vdi.ps"
-
-
-typedef struct sh_st {
- snap_id_t id;
- struct sh_st *next;
-} sh_t;
-
-#define SNAP_HASHSZ 1024
-sh_t *node_hash[SNAP_HASHSZ];
-#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
-
-#define SNAPID_EQUAL(_a,_b) \
- (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
-/*
- * sh_check_and_add: record id in the node hash.
- * Returns 1 if id was already present, 0 if it was newly added.
- *
- * Fix: the chain walk used '*s = (*s)->next', which overwrote the bucket
- * head on every step and so discarded (and leaked) earlier entries.  The
- * cursor is now advanced instead.
- */
-int sh_check_and_add(snap_id_t *id)
-{
-    sh_t **s = &node_hash[SNAP_HASH(id)];
-
-    while (*s != NULL) {
-        if (SNAPID_EQUAL(&((*s)->id), id))
-            return 1;
-        s = &(*s)->next;
-    }
-
-    *s = (sh_t *)malloc(sizeof(sh_t));
-    if (*s == NULL)
-        return 0; /* out of memory: report as unseen, nothing recorded */
-
-    (*s)->id = *id;
-    (*s)->next = NULL;
-
-    return 0;
-}
-
-/*
- * Standalone stress test (built with VDI_STANDALONE): creates a few seed
- * VDIs, runs a random mix of snapshot / fork-point / new-VDI operations,
- * then dumps the resulting snapshot graph to a dot file and renders it to
- * postscript with dot(1).
- *
- * Fixes over the previous version: the fork-point table bound was
- * '> FORK_POINTS', allowing a write one past the end of fork_points[];
- * vdi_create(), fopen() and snap_get_block() results are now checked; and
- * the VDI node format string, which had been split across two lines, is
- * rejoined.
- */
-int main(int argc, char *argv[])
-{
-    vdi_t *vdi_list[TEST_VDIS];
-    vdi_t *vdi;
-    snap_id_t id, fork_points[FORK_POINTS];
-    int nr_vdis = 0, nr_forks = 0;
-    int i, j, r;
-    FILE *f;
-    char name[VDI_NAME_SZ];
-
-    __init_blockstore();
-    __init_vdi();
-
-    printf("[o] Generating seed VDIs. (%d VDIs)\n", INIT_VDIS);
-
-    for (i=0; i<INIT_VDIS; i++) {
-        r=rand();
-
-        sprintf(name, "VDI Number %d", nr_vdis);
-        vdi_list[i] = vdi_create(NULL, name);
-        if ( vdi_list[i] == NULL ) {
-            printf("Failed to create VDI!\n");
-            return 1;
-        }
-        for (j=0; j<(r%INIT_SNAPS); j++)
-            vdi_snapshot(vdi_list[i]);
-        fork_points[i] = vdi_list[i]->snap;
-        nr_vdis++;
-        nr_forks++;
-    }
-
-    printf("[o] Running a random workload. (%d iterations)\n", NR_ITERS);
-
-    for (i=0; i<NR_ITERS; i++) {
-        r = rand();
-
-        if ( r > NEW_FORK ) {
-            /* table full: valid slots are 0 .. FORK_POINTS-1 */
-            if ( nr_forks >= FORK_POINTS )
-                continue;
-            id = vdi_list[r%nr_vdis]->snap;
-            if ( ( id.block == 0 ) || ( id.index == 0 ) )
-                continue;
-            id.index--;
-            fork_points[nr_forks++] = id;
-
-        } else if ( r > NEW_ROOT_VDI ) {
-
-            if ( nr_vdis == TEST_VDIS )
-                continue;
-
-            sprintf(name, "VDI Number %d.", nr_vdis);
-            vdi = vdi_create(NULL, name);
-            if ( vdi != NULL )
-                vdi_list[nr_vdis++] = vdi;
-
-        } else if ( r > NEW_FORK_VDI ) {
-
-            if ( nr_vdis == TEST_VDIS )
-                continue;
-
-            sprintf(name, "VDI Number %d.", nr_vdis);
-            vdi = vdi_create(&fork_points[r%nr_forks], name);
-            if ( vdi != NULL )
-                vdi_list[nr_vdis++] = vdi;
-
-        } else /* SNAPSHOT */ {
-
-            vdi_snapshot(vdi_list[r%nr_vdis]);
-
-        }
-    }
-
-    /* now dump it out to a dot file. */
-    printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis);
-
-    f = fopen(GRAPH_DOT_FILE, "w");
-    if ( f == NULL ) {
-        perror(GRAPH_DOT_FILE);
-        return 1;
-    }
-
-    /* write graph preamble */
-    fprintf(f, "digraph G {\n");
-    fprintf(f, " rankdir=LR\n");
-
-    for (i=0; i<nr_vdis; i++) {
-        char oldnode[255];
-        snap_block_t *blk;
-        snap_id_t sid = vdi_list[i]->snap;
-        int nr_snaps, done=0;
-
-        /* add a node for the id */
-        printf("vdi: %d\n", i);
-        fprintf(f, " n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n",
-                sid.block, sid.index, vdi_list[i]->name,
-                sid.block, sid.index);
-        sprintf(oldnode, "n%Ld%d", sid.block, sid.index);
-
-        while (sid.block != 0) {
-            blk = snap_get_block(sid.block);
-            if ( blk == NULL )
-                break; /* unreadable log block: stop following this chain */
-            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - sid.index);
-            sid = blk->hdr.fork_block;
-
-            done = sh_check_and_add(&sid);
-
-            /* add a node for the fork_id */
-            if (!done) {
-                fprintf(f, " n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n",
-                        sid.block, sid.index,
-                        sid.block, sid.index);
-            }
-
-            /* add an edge between them */
-            fprintf(f, " n%Ld%d -> %s [label=\"%u snapshots\"]\n",
-                    sid.block, sid.index, oldnode, nr_snaps);
-            sprintf(oldnode, "n%Ld%d", sid.block, sid.index);
-            freeblock(blk);
-
-            if (done) break;
-        }
-    }
-
-    /* write graph postamble */
-    fprintf(f, "}\n");
-    fclose(f);
-
-    printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE);
-    {
-        char cmd[255];
-        sprintf(cmd, "dot %s -Tps -o %s", GRAPH_DOT_FILE, GRAPH_PS_FILE);
-        system(cmd);
-    }
-    return 0;
-}
-
-#endif
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/vdi.h
--- a/tools/blktap/vdi.h Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,55 +0,0 @@
-#ifndef _VDI_H_
-#define _VDI_H_
-/**************************************************************************
- *
- * vdi.h
- *
- * Virtual Disk Image (VDI) Interfaces
- *
- */
-
-#ifndef __VDI_H__
-#define __VDI_H__
-
-#include "blktaplib.h"
-#include "snaplog.h"
-
-#define VDI_HEIGHT 27 /* Note that these are now hard-coded */
-#define VDI_REG_HEIGHT 27 /* in the async lookup code */
-
-#define VDI_NAME_SZ 256
-
-
-/* A virtual disk image.  The same structure is written to the block store
- * (see writeblock(vdi->block, vdi) in vdi.c); radix_lock is a process-local
- * pointer that vdi_get()/vdi_create() allocate after loading. */
-typedef struct vdi {
- u64 id; /* unique vdi id -- used by the registry */
- u64 block; /* block where this vdi lives (also unique)*/
- u64 radix_root; /* radix root node for block mappings */
- snap_id_t snap; /* next snapshot slot for this VDI */
- struct vdi *next; /* used to hash-chain in blkif. */
- blkif_vdev_t vdevice; /* currently mounted as... */
- struct radix_lock *radix_lock;/* per-line L1 RW lock for parallel reqs */
- char name[VDI_NAME_SZ];/* human readable vdi name */
-} vdi_t;
-
-#define VDI_REG_MAGIC 0xff00ff0bb0ff00ffLL
-
-/* Registry block: magic plus the number of VDIs created so far, which is
- * also used as the next VDI id. */
-typedef struct vdi_registry {
- u64 magic;
- u64 nr_vdis;
-} vdi_registry_t;
-
-
-/* Initialise the radix cache and make sure a registry exists; 0 or -1. */
-int __init_vdi(void);
-
-/* Load a VDI by id (NULL if unknown); release it with vdi_put(). */
-vdi_t *vdi_get(u64 vdi_id);
-/* Free the vdi's radix_lock and its block. */
-void vdi_put(vdi_t *vdi);
-/* Read (or create) the registry; caller must freeblock() the result. */
-vdi_registry_t *get_vdi_registry(void);
-/* Create and register a new VDI, optionally forked from parent_snap. */
-vdi_t *vdi_create(snap_id_t *parent_snap, char *name);
-/* NOTE(review): no definition of the next two is visible in vdi.c --
- * confirm they are still implemented somewhere. */
-u64 vdi_lookup_block(vdi_t *vdi, u64 vdi_block, int *writable);
-void vdi_update_block(vdi_t *vdi, u64 vdi_block, u64 g_block);
-/* Record the current radix root in the snapshot log and start a new one. */
-void vdi_snapshot(vdi_t *vdi);
-
-
-#endif /* __VDI_H__ */
-
-#endif //_VDI_H_
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/vdi_create.c
--- a/tools/blktap/vdi_create.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,52 +0,0 @@
-/**************************************************************************
- *
- * vdi_create.c
- *
- * Create a new vdi.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-/*
- * vdi_create <VDI Name> [<snap block> <snap idx>]
- *
- * Creates a new VDI, optionally forked from the given snapshot.
- *
- * Fixes over the previous version: the name terminator was stored at
- * name[VDI_NAME_SZ], one byte past the array; freeblock() was called on a
- * NULL vdi; and the vdi is now released with vdi_put() so that its
- * radix_lock is freed as well.
- */
-int main(int argc, char *argv[])
-{
-    vdi_t *vdi;
-    char name[VDI_NAME_SZ] = "";
-    snap_id_t id;
-    int from_snap = 0;
-
-    __init_blockstore();
-    __init_vdi();
-
-    if ( argc == 1 ) {
-        printf("usage: %s <VDI Name> [<snap block> <snap idx>]\n", argv[0]);
-        exit(-1);
-    }
-
-    strncpy( name, argv[1], VDI_NAME_SZ - 1);
-    name[VDI_NAME_SZ - 1] = '\0';
-
-    if ( argc > 3 ) {
-        id.block = (u64) atoll(argv[2]);
-        id.index = (unsigned int) atol (argv[3]);
-        from_snap = 1;
-    }
-
-    vdi = vdi_create( from_snap ? &id : NULL, name);
-
-    if ( vdi == NULL ) {
-        printf("Failed to create VDI!\n");
-        exit(-1);
-    }
-
-    vdi_put(vdi);
-
-    return (0);
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/vdi_fill.c
--- a/tools/blktap/vdi_fill.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,81 +0,0 @@
-/**************************************************************************
- *
- * vdi_fill.c
- *
- * Hoover a file or device into a vdi.
- * You must first create the vdi with vdi_create.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "requests-async.h"
-#include "vdi.h"
-
-/*
- * vdi_fill <VDI id> <filename>
- *
- * Copies a file or device into an existing VDI, one block at a time.
- *
- * Fixes over the previous version: read()'s result was stored in a u64, so
- * a read error (-1) looked like a huge positive count and the loop never
- * ended; a short final read left stale bytes from the previous block in the
- * buffer (it is now zero-padded); the file descriptor is closed; and the
- * vdi is released with vdi_put() so its radix_lock is freed.
- */
-int main(int argc, char *argv[])
-{
-    vdi_t *vdi;
-    u64 id;
-    int fd;
-    struct stat st;
-    u64 tot_size;
-    char spage[BLOCK_SIZE];
-    u64 vblock = 0;
-    ssize_t count;
-
-    __init_blockstore();
-    init_block_async();
-    __init_vdi();
-
-    if ( argc < 3 ) {
-        printf("usage: %s <VDI id> <filename>\n", argv[0]);
-        exit(-1);
-    }
-
-    id = (u64) atoll(argv[1]);
-
-    vdi = vdi_get( id );
-
-    if ( vdi == NULL ) {
-        printf("Failed to retreive VDI %Ld!\n", id);
-        exit(-1);
-    }
-
-    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
-
-    if (fd < 0) {
-        printf("Couldn't open %s!\n", argv[2]);
-        exit(-1);
-    }
-
-    if ( fstat(fd, &st) != 0 ) {
-        printf("Couldn't stat %s!\n", argv[2]);
-        exit(-1);
-    }
-
-    tot_size = (u64) st.st_size;
-    printf("Filling VDI %Ld with %Ld bytes.\n", id, tot_size);
-
-    printf("%011Ld blocks total\n", tot_size / BLOCK_SIZE);
-    printf(" ");
-    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
-        /* zero-pad a short (final) block instead of writing stale data */
-        if ( count < (ssize_t)BLOCK_SIZE )
-            memset(spage + count, 0, BLOCK_SIZE - count);
-
-        vdi_write_s(vdi, vblock, spage);
-
-        vblock++;
-        if ((vblock % 512) == 0)
-            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
-        fflush(stdout);
-    }
-    printf("\n");
-
-    if ( count < 0 ) {
-        perror("read");
-        exit(-1);
-    }
-
-    close(fd);
-    vdi_put(vdi);
-
-    return (0);
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/vdi_list.c
--- a/tools/blktap/vdi_list.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,47 +0,0 @@
-/**************************************************************************
- *
- * vdi_list.c
- *
- * Print a list of VDIs on the block store.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-/*
- * vdi_list: print the id and name of every VDI in the registry.
- *
- * Fixes over the previous version: each vdi is released with vdi_put()
- * (freeblock() alone leaked the radix_lock that vdi_get() allocates), and
- * the loop counter has the same unsigned 64-bit type as nr_vdis.
- */
-int main(int argc, char *argv[])
-{
-    vdi_registry_t *reg;
-    vdi_t *vdi;
-    u64 i;
-
-    __init_blockstore();
-    __init_vdi();
-
-    reg = get_vdi_registry();
-
-    if ( reg == NULL ) {
-        printf("couldn't get VDI registry.\n");
-        exit(-1);
-    }
-
-    for (i=0; i < reg->nr_vdis; i++) {
-        vdi = vdi_get(i);
-
-        if ( vdi != NULL ) {
-
-            printf("%10Ld %60s\n", vdi->id, vdi->name);
-            vdi_put(vdi);
-
-        }
-    }
-
-    freeblock(reg);
-
-    return 0;
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/vdi_snap.c
--- a/tools/blktap/vdi_snap.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,43 +0,0 @@
-/**************************************************************************
- *
- * vdi_snap.c
- *
- * Snapshot a vdi.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-/*
- * vdi_snap <VDI id>: take a snapshot of the given VDI.
- *
- * Fixes over the previous version: freeblock() is no longer called on a
- * NULL vdi, and the vdi is released with vdi_put() before exiting.
- */
-int main(int argc, char *argv[])
-{
-    vdi_t *vdi;
-    u64 id;
-
-    __init_blockstore();
-    __init_vdi();
-
-    if ( argc == 1 ) {
-        printf("usage: %s <VDI id>\n", argv[0]);
-        exit(-1);
-    }
-
-    id = (u64) atoll(argv[1]);
-
-    vdi = vdi_get(id);
-
-    if ( vdi == NULL ) {
-        printf("couldn't find the requested VDI.\n");
-        exit(-1);
-    }
-
-    vdi_snapshot(vdi);
-    vdi_put(vdi);
-
-    return 0;
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/vdi_snap_delete.c
--- a/tools/blktap/vdi_snap_delete.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,48 +0,0 @@
-/**************************************************************************
- *
- * vdi_snap_delete.c
- *
- * Delete a snapshot.
- *
- * This is not finished: right now it takes a snap n and calls
- * snap_collapse(n,n+1).
- *
- * TODO: support for non-consecutive, non-same-block snaps
- * Avoid forking probs.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "snaplog.h"
-#include "radix.h"
-#include "vdi.h"
-
-/*
- * vdi_snap_delete <snap block> <snap idx>
- *
- * Deletes snapshot n by collapsing it into snapshot n+1 of the same block.
- *
- * Fix: snap_collapse() returns -1 on failure; this used to be reported as
- * "Freed -1 blocks." with a zero exit status.
- */
-int main(int argc, char *argv[])
-{
-    snap_id_t id, c_id;
-    int ret;
-
-    __init_blockstore();
-    __init_vdi();
-
-    if ( argc != 3 ) {
-        printf("usage: %s <snap block> <snap idx>\n", argv[0]);
-        exit(-1);
-    }
-
-    id.block = (u64) atoll(argv[1]);
-    id.index = (unsigned int) atol (argv[2]);
-
-    /* the child is the next record in the same block */
-    c_id = id;
-    c_id.index++;
-
-    ret = snap_collapse(VDI_HEIGHT, &id, &c_id);
-
-    if ( ret < 0 ) {
-        printf("Failed to delete snapshot.\n");
-        exit(-1);
-    }
-
-    printf("Freed %d blocks.\n", ret);
-
-    return 0;
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/vdi_snap_list.c
--- a/tools/blktap/vdi_snap_list.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,82 +0,0 @@
-/**************************************************************************
- *
- * vdi_snap_list.c
- *
- * Print a list of snapshots for the specified vdi.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-/*
- * vdi_snap_list <VDI id> [max snaps]
- *
- * Lists the snapshots of a VDI, newest first, following parent blocks until
- * the log ends or max snaps records have been printed (-1 = no limit).
- *
- * Fixes over the previous version: an unreadable log block is no longer
- * dereferenced, freeblock() is not called on a NULL vdi, and the vdi is
- * released with vdi_put().
- */
-int main(int argc, char *argv[])
-{
-    vdi_t *vdi;
-    u64 id;
-    int i, max_snaps = -1;
-    snap_block_t *blk;
-    snap_id_t sid;
-    char *t;
-
-    __init_blockstore();
-    __init_vdi();
-
-    if ( argc == 1 ) {
-        printf("usage: %s <VDI id> [max snaps]\n", argv[0]);
-        exit(-1);
-    }
-
-    id = (u64) atoll(argv[1]);
-
-    if ( argc > 2 ) {
-        max_snaps = atoi(argv[2]);
-    }
-
-    vdi = vdi_get(id);
-
-    if ( vdi == NULL ) {
-        printf("couldn't find the requested VDI.\n");
-        exit(-1);
-    }
-
-    /* vdi->snap is the next free slot; step back to the newest record */
-    sid = vdi->snap;
-    sid.index--;
-
-    printf("%8s%4s%37s %12s %1s\n", "Block", "idx", "timestamp",
-           "radix root", "d");
-
-    while (sid.block != 0) {
-        blk = snap_get_block(sid.block);
-        if ( blk == NULL )
-            break; /* unreadable log block: stop here */
-        for (i = sid.index; i >= 0; i--) {
-            if ( max_snaps == 0 ) {
-                freeblock(blk);
-                goto done;
-            }
-            t = ctime(&blk->snaps[i].timestamp.tv_sec);
-            t[strlen(t)-1] = '\0'; /* strip ctime's trailing newline */
-            printf("%8Ld%4u%30s %06lu %12Ld %1s\n",
-                   sid.block, i,
-                   t,
-                   blk->snaps[i].timestamp.tv_usec,
-                   blk->snaps[i].radix_root,
-                   blk->snaps[i].deleted ? "*" : " ");
-            if ( max_snaps != -1 )
-                max_snaps--;
-        }
-        sid = blk->hdr.parent_block;
-        freeblock(blk);
-    }
-done:
-    vdi_put(vdi);
-    return 0;
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/vdi_tree.c
--- a/tools/blktap/vdi_tree.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,132 +0,0 @@
-/**************************************************************************
- *
- * vdi_tree.c
- *
- * Output current vdi tree to dot and postscript.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-#define GRAPH_DOT_FILE "vdi.dot"
-#define GRAPH_PS_FILE "vdi.ps"
-
-typedef struct sh_st {
- snap_id_t id;
- struct sh_st *next;
-} sh_t;
-
-#define SNAP_HASHSZ 1024
-sh_t *node_hash[SNAP_HASHSZ];
-#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
-
-#define SNAPID_EQUAL(_a,_b) \
- (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
-/*
- * sh_check_and_add: record id in the node hash.
- * Returns 1 if id was already present, 0 if it was newly added.
- *
- * Fix: the chain walk used '*s = (*s)->next', which overwrote the bucket
- * head on every step and so discarded (and leaked) earlier entries.  The
- * cursor is now advanced instead.
- */
-int sh_check_and_add(snap_id_t *id)
-{
-    sh_t **s = &node_hash[SNAP_HASH(id)];
-
-    while (*s != NULL) {
-        if (SNAPID_EQUAL(&((*s)->id), id))
-            return 1;
-        s = &(*s)->next;
-    }
-
-    *s = (sh_t *)malloc(sizeof(sh_t));
-    if (*s == NULL)
-        return 0; /* out of memory: report as unseen, nothing recorded */
-
-    (*s)->id = *id;
-    (*s)->next = NULL;
-
-    return 0;
-}
-
-/*
- * vdi_tree [ps file]
- *
- * Dumps the fork/snapshot graph of every registered VDI to a dot file and
- * renders it to postscript with dot(1).
- *
- * Fixes over the previous version: ps_file[255] was written one past the
- * array; the VDI count printed was always 0; the dot command could overflow
- * its 255-byte buffer; vdi_get(), fopen() and snap_get_block() results were
- * dereferenced unchecked; each vdi and the registry were leaked; the final
- * message named the default output file rather than the one in use; and
- * the VDI node format string had been split across two lines.
- */
-int main(int argc, char *argv[])
-{
-    FILE *f;
-    char dot_file[255] = GRAPH_DOT_FILE;
-    char ps_file[255] = GRAPH_PS_FILE;
-    vdi_registry_t *reg;
-    vdi_t *vdi;
-    int i;
-
-    __init_blockstore();
-    __init_vdi();
-
-    reg = get_vdi_registry();
-
-    if ( reg == NULL ) {
-        printf("couldn't get VDI registry.\n");
-        exit(-1);
-    }
-
-    if ( argc > 1 ) {
-        strncpy(ps_file, argv[1], sizeof(ps_file) - 1);
-        ps_file[sizeof(ps_file) - 1] = '\0';
-    }
-
-    /* now dump it out to a dot file. */
-    printf("[o] Dumping state to a dot graph. (%d VDIs)\n",
-           (int)reg->nr_vdis);
-
-    f = fopen(dot_file, "w");
-    if ( f == NULL ) {
-        perror(dot_file);
-        exit(-1);
-    }
-
-    /* write graph preamble */
-    fprintf(f, "digraph G {\n");
-    fprintf(f, " rankdir=LR\n");
-
-    for (i=0; (u64)i<reg->nr_vdis; i++) {
-        char oldnode[255];
-        snap_block_t *blk;
-        snap_id_t id;
-        int nr_snaps, done=0;
-
-        vdi = vdi_get(i);
-        if ( vdi == NULL )
-            continue; /* id not registered or unreadable */
-        id = vdi->snap;
-        /* add a node for the id */
-        printf("vdi: %d\n", i);
-        fprintf(f, " n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n",
-                id.block, id.index, vdi->name,
-                id.block, id.index);
-        sprintf(oldnode, "n%Ld%d", id.block, id.index);
-
-        while (id.block != 0) {
-            blk = snap_get_block(id.block);
-            if ( blk == NULL )
-                break; /* unreadable log block: stop following this chain */
-            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
-            id = blk->hdr.fork_block;
-
-            done = sh_check_and_add(&id);
-
-            /* add a node for the fork_id */
-            if (!done) {
-                fprintf(f, " n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n",
-                        id.block, id.index,
-                        id.block, id.index);
-            }
-
-            /* add an edge between them */
-            fprintf(f, " n%Ld%d -> %s [label=\"%u snapshots\"]\n",
-                    id.block, id.index, oldnode, nr_snaps);
-            sprintf(oldnode, "n%Ld%d", id.block, id.index);
-            freeblock(blk);
-
-            if (done) break;
-        }
-
-        vdi_put(vdi);
-    }
-
-    /* write graph postamble */
-    fprintf(f, "}\n");
-    fclose(f);
-    freeblock(reg);
-
-    printf("[o] Generating postscript graph. (%s)\n", ps_file);
-    {
-        /* NOTE(review): ps_file comes straight from argv and is passed to
-         * the shell unquoted -- acceptable for a local debug tool only. */
-        char cmd[1024];
-        snprintf(cmd, sizeof(cmd), "dot %s -Tps -o %s", dot_file, ps_file);
-        system(cmd);
-    }
-    return 0;
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/vdi_unittest.c
--- a/tools/blktap/vdi_unittest.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,184 +0,0 @@
-/**************************************************************************
- *
- * vdi_unittest.c
- *
- * Run a small test workload to ensure that data access through a vdi
- * is (at least superficially) correct.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "requests-async.h"
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-#define TEST_PAGES 32
-static char *zero_page;
-static char pages[TEST_PAGES][BLOCK_SIZE];
-static int next_page = 0;
-
-/*
- * fill_test_pages: fill every test page with random data and allocate the
- * all-zero reference page.
- *
- * Fix: the inner loop wrote BLOCK_SIZE/4 longs per page, which is twice the
- * page size where long is 8 bytes and overran each page (and the whole
- * array on the last one).  The count is now derived from sizeof(long).
- */
-void fill_test_pages(void)
-{
-    int i;
-    size_t j;
-    long *page;
-
-    for (i=0; i< TEST_PAGES; i++) {
-        page = (long *)pages[i];
-        for (j=0; j<(BLOCK_SIZE/sizeof(long)); j++) {
-            page[j] = random();
-        }
-    }
-
-    zero_page = newblock();
-}
-
-/*
- * make_vaddr: pack three radix indices into one virtual block address.
- * L3 occupies the low 9 bits, L2 the next 9, and L1 everything above.
- */
-inline u64 make_vaddr(u64 L1, u64 L2, u64 L3)
-{
-    return (((L1 << 9) | L2) << 9) | L3;
-}
-
-/*
- * touch_block: write the next unused test page at (L1, L2, L3), read it
- * back, and report if the read fails or returns different data.
- *
- * Fixes over the previous version: the block read back is freed on the
- * mismatch path as well, and the test-page cursor can no longer run past
- * the end of pages[].
- */
-void touch_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3)
-{
-    u64 vaddr;
-    char *page;
-    char *rpage = NULL;
-
-    if (next_page >= TEST_PAGES)
-    {
-        printf( "out of test pages\n");
-        return;
-    }
-    page = pages[next_page++];
-
-    printf("TOUCH (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
-
-    vaddr = make_vaddr(L1, L2, L3);
-    vdi_write_s(vdi, vaddr, page);
-    rpage = vdi_read_s(vdi, vaddr);
-
-    if (rpage == NULL)
-    {
-        printf( "read %Lu returned NULL\n", vaddr);
-        return;
-    }
-
-    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
-        printf( "read %Lu returned a different page\n", vaddr);
-
-    freeblock(rpage);
-}
-
-/*
- * test_block: read the block at (L1, L2, L3) and report if the read fails
- * or the data differs from the expected page.
- *
- * Fix: the block read back is now freed on the mismatch path as well.
- */
-void test_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3, char *page)
-{
-    u64 vaddr;
-    char *rpage = NULL;
-
-    printf("TEST (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
-
-    vaddr = make_vaddr(L1, L2, L3);
-    rpage = vdi_read_s(vdi, vaddr);
-
-    if (rpage == NULL)
-    {
-        printf( "read %Lu returned NULL\n", vaddr);
-        return;
-    }
-
-    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
-        printf( "read %Lu returned a different page\n", vaddr);
-
-    freeblock(rpage);
-}
-
-/* Exercise the write and read paths of a VDI before and after a snapshot.
- * The order of the calls matters: each touch_block() consumes the next test
- * page, and i/j/k remember which page was written at an address so that
- * test_block() can verify it after the snapshot. */
-void coverage_test(vdi_t *vdi)
-{
- u64 vaddr;
- int i, j, k;
-
- /* Do a series of writes and reads to test all paths through the
- * async radix code. The radix request code will dump CRC warnings
- * if there are data problems here as well.
- */
-
- /* L1 Zero */
- touch_block(vdi, 0, 0, 0);
-
- /* L2 Zero */
- /* i = index of the page about to be written at (0,1,0) */
- i = next_page;
- touch_block(vdi, 0, 1, 0);
-
- /* L3 Zero */
- /* j and k = pages about to be written at (0,0,1) and (0,1,1) */
- j = next_page;
- touch_block(vdi, 0, 0, 1);
- k = next_page;
- touch_block(vdi, 0, 1, 1);
-
- /* Direct write */
- touch_block(vdi, 0, 0, 0);
-
- vdi_snapshot(vdi);
-
- /* L1 fault */
- touch_block(vdi, 0, 0, 0);
- /* test the read-only branches that should have been copied over. */
- test_block(vdi, 0, 1, 0, pages[i]);
- test_block(vdi, 0, 0, 1, pages[j]);
-
- /* L2 fault */
- touch_block(vdi, 0, 1, 0);
- test_block(vdi, 0, 1, 1, pages[k]);
-
- /* L3 fault */
- touch_block(vdi, 0, 0, 1);
-
- /* read - L1 zero */
- /* addresses never written are compared against zero_page */
- test_block(vdi, 1, 0, 0, zero_page);
-
- /* read - L2 zero */
- test_block(vdi, 0, 2, 0, zero_page);
-
- /* read - L3 zero */
- test_block(vdi, 0, 0, 2, zero_page);
-}
-
-/*
- * vdi_unittest: create a scratch VDI and run the coverage test on it.
- *
- * Fixes over the previous version: eight unused locals are removed (they
- * break a -Wall -Werror build), freeblock() is no longer called on a NULL
- * vdi, and the vdi is released with vdi_put() so its radix_lock is freed.
- */
-int main(int argc, char *argv[])
-{
-    vdi_t *vdi;
-
-    __init_blockstore();
-    init_block_async();
-    __init_vdi();
-
-    vdi = vdi_create( NULL, "UNIT TEST VDI");
-
-    if ( vdi == NULL ) {
-        printf("Failed to create VDI!\n");
-        exit(-1);
-    }
-
-    fill_test_pages();
-    coverage_test(vdi);
-
-    vdi_put(vdi);
-
-    return (0);
-}
diff -r 99ff7c3435b2 -r abbc1d071e22 tools/blktap/vdi_validate.c
--- a/tools/blktap/vdi_validate.c Sun Jul 3 12:02:01 2005
+++ /dev/null Sun Jul 3 14:14:09 2005
@@ -1,97 +0,0 @@
-/**************************************************************************
- *
- * vdi_validate.c
- *
- * Intended to sanity-check vm_fill and the underlying vdi code.
- *
- * Block-by-block compare of a vdi with a file/device on the disk.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-#include "requests-async.h"
-
-int main(int argc, char *argv[])
-{
- vdi_t *vdi;
- u64 id;
- int fd;
- struct stat st;
- u64 tot_size;
- char spage[BLOCK_SIZE], *dpage;
- char *vpage;
- u64 vblock = 0, count=0;
-
- __init_blockstore();
- init_block_async();
- __init_vdi();
-
- if ( argc < 3 ) {
- printf("usage: %s <VDI id> <filename>\n", argv[0]);
- exit(-1);
- }
-
- id = (u64) atoll(argv[1]);
-
- vdi = vdi_get( id );
-
- if ( vdi == NULL ) {
- printf("Failed to retreive VDI %Ld!\n", id);
- exit(-1);
- }
-
- fd = open(argv[2], O_RDONLY | O_LARGEFILE);
-
- if (fd < 0) {
- printf("Couldn't open %s!\n", argv[2]);
- exit(-1);
- }
-
- if ( fstat(fd, &st) != 0 ) {
- printf("Couldn't stat %s!\n", argv[2]);
- exit(-1);
- }
-
- tot_size = (u64) st.st_size;
- printf("Testing VDI %Ld (%Ld bytes).\n", id, tot_size);
-
- printf(" ");
- while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
-
- dpage = vdi_read_s(vdi, vblock);
-
- if (dpage == NULL) {
- printf("\n\nfound an unmapped VDI block (%Ld)\n", vblock);
- exit(0);
- }
-
- if (memcmp(spage, dpage, BLOCK_SIZE) != 0) {
- printf("\n\nblocks don't match! (%Ld)\n", vblock);
- exit(0);
- }
-
- freeblock(dpage);
-
- vblock++;
- if ((vblock % 1024) == 0) {
- printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
- fflush(stdout);
- }
- }
- printf("\n");
-
- printf("VDI %Ld looks good!\n", id);
-
- freeblock(vdi);
-
- return (0);
-}
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|