WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] [xen-unstable] blktap2: a completely rewritten blktap im

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] blktap2: a completely rewritten blktap implementation
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Wed, 27 May 2009 04:30:42 -0700
Delivery-date: Wed, 27 May 2009 04:32:42 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1243335151 -3600
# Node ID 1c627434605e7747689047e1761c193ceb4f9ef0
# Parent  f210a633571c17c1a5e47980d53b00b0cab5b726
blktap2: a completely rewritten blktap implementation

Benefits to blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
   the linux dom0 command line, rather than being spawned in response
   to XenStore events.  This is handy for debugging, makes blktap
   generally easier to work with, and is a step toward a generic
   user-level block device implementation that is not Xen-specific.

* Improved tapdisk infrastructure: simpler request forwarding, new
   request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management.  No
   allocations on the block data path, IO retry logic to protect
   guests from transient block device failures.  This has been tested
   and is known to work on weird environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support.  The VHD code in this release has been rigorously
   tested, and represents a very mature implementation of the VHD
   image format.

* No more duplication of mechanism with blkback.  The blktap kernel
   module has changed dramatically from the original blktap.  Blkback
   is now always used to talk to Xen guests, blktap just presents a
   Linux gendisk that blkback can export.  This is done while
   preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.

Signed-off-by: Jake Wires <jake.wires@xxxxxxxxxx>
Signed-off-by: Dutch Meyer <dmeyer@xxxxxxxxx>
---
 .hgignore                                        |   14 
 tools/Makefile                                   |    1 
 tools/blktap2/Makefile                           |   34 
 tools/blktap2/README                             |  122 
 tools/blktap2/daemon/Makefile                    |   55 
 tools/blktap2/daemon/lib/Makefile                |   69 
 tools/blktap2/daemon/lib/xs_api.c                |  323 ++
 tools/blktap2/daemon/lib/xs_api.h                |   62 
 tools/blktap2/daemon/tapdisk-channel.c           | 1367 +++++++++
 tools/blktap2/daemon/tapdisk-daemon.c            |  599 ++++
 tools/blktap2/daemon/tapdisk-dispatch-common.c   |   94 
 tools/blktap2/daemon/tapdisk-dispatch.h          |   95 
 tools/blktap2/drivers/Makefile                   |  105 
 tools/blktap2/drivers/aes.c                      | 1319 +++++++++
 tools/blktap2/drivers/aes.h                      |   28 
 tools/blktap2/drivers/atomicio.c                 |   61 
 tools/blktap2/drivers/blk.h                      |   30 
 tools/blktap2/drivers/blk_linux.c                |   43 
 tools/blktap2/drivers/blktap2.h                  |   66 
 tools/blktap2/drivers/block-aio.c                |  272 +
 tools/blktap2/drivers/block-cache.c              |  787 +++++
 tools/blktap2/drivers/block-log.c                |  688 ++++
 tools/blktap2/drivers/block-qcow.c               | 1517 ++++++++++
 tools/blktap2/drivers/block-ram.c                |  269 +
 tools/blktap2/drivers/block-vhd.c                | 2321 ++++++++++++++++
 tools/blktap2/drivers/bswap.h                    |  214 +
 tools/blktap2/drivers/check_gcrypt               |   14 
 tools/blktap2/drivers/disktypes.h                |  184 +
 tools/blktap2/drivers/img2qcow.c                 |  318 ++
 tools/blktap2/drivers/io-optimize.c              |  664 ++++
 tools/blktap2/drivers/io-optimize.h              |   68 
 tools/blktap2/drivers/lock.c                     | 1000 ++++++
 tools/blktap2/drivers/lock.h                     |   51 
 tools/blktap2/drivers/log.h                      |  123 
 tools/blktap2/drivers/profile.h                  |  191 +
 tools/blktap2/drivers/qcow-create.c              |  121 
 tools/blktap2/drivers/qcow.h                     |  131 
 tools/blktap2/drivers/qcow2raw.c                 |  449 +++
 tools/blktap2/drivers/scheduler.c                |  265 +
 tools/blktap2/drivers/scheduler.h                |   65 
 tools/blktap2/drivers/tapdisk-client.c           |  496 +++
 tools/blktap2/drivers/tapdisk-diff.c             |  797 +++++
 tools/blktap2/drivers/tapdisk-driver.c           |  100 
 tools/blktap2/drivers/tapdisk-driver.h           |   62 
 tools/blktap2/drivers/tapdisk-filter.c           |  271 +
 tools/blktap2/drivers/tapdisk-filter.h           |   67 
 tools/blktap2/drivers/tapdisk-image.c            |  160 +
 tools/blktap2/drivers/tapdisk-image.h            |   55 
 tools/blktap2/drivers/tapdisk-interface.c        |  250 +
 tools/blktap2/drivers/tapdisk-interface.h        |   53 
 tools/blktap2/drivers/tapdisk-ipc.c              |  279 +
 tools/blktap2/drivers/tapdisk-ipc.h              |   43 
 tools/blktap2/drivers/tapdisk-log.c              |  255 +
 tools/blktap2/drivers/tapdisk-log.h              |   51 
 tools/blktap2/drivers/tapdisk-queue.c            |  441 +++
 tools/blktap2/drivers/tapdisk-queue.h            |  113 
 tools/blktap2/drivers/tapdisk-ring.c             |  439 +++
 tools/blktap2/drivers/tapdisk-ring.h             |   87 
 tools/blktap2/drivers/tapdisk-server.c           |  415 ++
 tools/blktap2/drivers/tapdisk-server.h           |   65 
 tools/blktap2/drivers/tapdisk-stream.c           |  600 ++++
 tools/blktap2/drivers/tapdisk-utils.c            |  199 +
 tools/blktap2/drivers/tapdisk-utils.h            |   42 
 tools/blktap2/drivers/tapdisk-vbd.c              | 1758 ++++++++++++
 tools/blktap2/drivers/tapdisk-vbd.h              |  193 +
 tools/blktap2/drivers/tapdisk.c                  |   66 
 tools/blktap2/drivers/tapdisk.h                  |  158 +
 tools/blktap2/drivers/tapdisk2.c                 |  436 +++
 tools/blktap2/drivers/td.c                       |  691 ++++
 tools/blktap2/drivers/xmsnap                     |   78 
 tools/blktap2/include/Makefile                   |   14 
 tools/blktap2/include/atomicio.h                 |   33 
 tools/blktap2/include/blktaplib.h                |  249 +
 tools/blktap2/include/libvhd-journal.h           |   68 
 tools/blktap2/include/libvhd.h                   |  308 ++
 tools/blktap2/include/list.h                     |   93 
 tools/blktap2/include/lvm-util.h                 |   71 
 tools/blktap2/include/relative-path.h            |   43 
 tools/blktap2/include/tapdisk-message.h          |  141 
 tools/blktap2/include/vhd-util.h                 |   44 
 tools/blktap2/include/vhd.h                      |  221 +
 tools/blktap2/lvm/Makefile                       |   38 
 tools/blktap2/lvm/lvm-util.c                     |  349 ++
 tools/blktap2/vhd/Makefile                       |   55 
 tools/blktap2/vhd/lib/Makefile                   |   73 
 tools/blktap2/vhd/lib/atomicio.c                 |   61 
 tools/blktap2/vhd/lib/libvhd-journal.c           | 1534 ++++++++++
 tools/blktap2/vhd/lib/libvhd.c                   | 3328 +++++++++++++++++++++++
 tools/blktap2/vhd/lib/relative-path.c            |  299 ++
 tools/blktap2/vhd/lib/vhd-util-check.c           |  977 ++++++
 tools/blktap2/vhd/lib/vhd-util-coalesce.c        |  218 +
 tools/blktap2/vhd/lib/vhd-util-create.c          |   80 
 tools/blktap2/vhd/lib/vhd-util-fill.c            |  105 
 tools/blktap2/vhd/lib/vhd-util-modify.c          |  132 
 tools/blktap2/vhd/lib/vhd-util-query.c           |  159 +
 tools/blktap2/vhd/lib/vhd-util-read.c            |  742 +++++
 tools/blktap2/vhd/lib/vhd-util-repair.c          |   84 
 tools/blktap2/vhd/lib/vhd-util-resize.c          | 1131 +++++++
 tools/blktap2/vhd/lib/vhd-util-revert.c          |  106 
 tools/blktap2/vhd/lib/vhd-util-scan.c            | 1315 +++++++++
 tools/blktap2/vhd/lib/vhd-util-set-field.c       |  106 
 tools/blktap2/vhd/lib/vhd-util-snapshot.c        |  216 +
 tools/blktap2/vhd/vhd-update.c                   |  261 +
 tools/blktap2/vhd/vhd-util.c                     |  160 +
 tools/check/check_uuid_devel                     |    6 
 tools/python/xen/xend/XendDomainInfo.py          |   49 
 tools/python/xen/xend/server/BlktapController.py |   54 
 tools/python/xen/xend/server/DevController.py    |   32 
 108 files changed, 35869 insertions(+), 5 deletions(-)

diff -r f210a633571c -r 1c627434605e .hgignore
--- a/.hgignore Tue May 26 11:05:04 2009 +0100
+++ b/.hgignore Tue May 26 11:52:31 2009 +0100
@@ -103,7 +103,19 @@
 ^stubdom/lwip/
 ^stubdom/ioemu/
 ^tools/.*/build/lib.*/.*\.py$
-^tools/blktap/Makefile\.smh$
+^tools/blktap2/daemon/blktapctrl$
+^tools/blktap2/drivers/img2qcow$
+^tools/blktap2/drivers/lock-util$
+^tools/blktap2/drivers/qcow-create$
+^tools/blktap2/drivers/qcow2raw$
+^tools/blktap2/drivers/tapdisk$
+^tools/blktap2/drivers/tapdisk-client$
+^tools/blktap2/drivers/tapdisk-diff$
+^tools/blktap2/drivers/tapdisk-stream$
+^tools/blktap2/drivers/tapdisk2$
+^tools/blktap2/drivers/td-util$
+^tools/blktap2/vhd/vhd-update$
+^tools/blktap2/vhd/vhd-util$
 ^tools/blktap/drivers/blktapctrl$
 ^tools/blktap/drivers/img2qcow$
 ^tools/blktap/drivers/qcow-create$
diff -r f210a633571c -r 1c627434605e tools/Makefile
--- a/tools/Makefile    Tue May 26 11:05:04 2009 +0100
+++ b/tools/Makefile    Tue May 26 11:52:31 2009 +0100
@@ -22,6 +22,7 @@ SUBDIRS-y += xenstat
 SUBDIRS-y += xenstat
 SUBDIRS-$(CONFIG_Linux) += libaio
 SUBDIRS-$(CONFIG_Linux) += blktap
+SUBDIRS-$(CONFIG_Linux) += blktap2
 SUBDIRS-y += libfsimage
 SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen
 SUBDIRS-$(CONFIG_Linux) += fs-back
diff -r f210a633571c -r 1c627434605e tools/blktap2/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/Makefile    Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,34 @@
+XEN_ROOT = ../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+CFLAGS  += $(CFLAGS_libxenctrl)
+LDFLAGS += $(LDFLAGS_libxenctrl)
+
+SUBDIRS-y :=
+SUBDIRS-y += include
+SUBDIRS-y += lvm
+SUBDIRS-y += vhd
+SUBDIRS-y += drivers
+SUBDIRS-y += daemon
+
+.PHONY: all
+all: build
+
+.PHONY: build
+build:
+       @set -e; for subdir in $(SUBDIRS-y); do \
+       $(MAKE) -C $$subdir all;       \
+               done
+
+.PHONY: install
+install:
+       @set -e; for subdir in $(SUBDIRS-y); do \
+               $(MAKE) -C $$subdir install; \
+       done
+
+.PHONY: clean
+clean:
+       rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) TAGS
+       @set -e; for subdir in $(SUBDIRS-y); do \
+       $(MAKE) -C $$subdir clean;       \
+               done
diff -r f210a633571c -r 1c627434605e tools/blktap2/README
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/README      Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,122 @@
+Blktap Userspace Tools + Library
+================================
+
+Andrew Warfield and Julian Chesterfield
+16th June 2006
+
+{firstname.lastname}@cl.cam.ac.uk
+
+The blktap userspace toolkit provides a user-level disk I/O
+interface. The blktap mechanism involves a kernel driver that acts
+similarly to the existing Xen/Linux blkback driver, and a set of
+associated user-level libraries.  Using these tools, blktap allows
+virtual block devices presented to VMs to be implemented in userspace
+and to be backed by raw partitions, files, network, etc.
+
+The key benefit of blktap is that it makes it easy and fast to write
+arbitrary block backends, and that these user-level backends actually
+perform very well.  Specifically:
+
+- Metadata disk formats such as Copy-on-Write, encrypted disks, sparse
+  formats and other compression features can be easily implemented.
+
+- Accessing file-based images from userspace avoids problems related
+  to flushing dirty pages which are present in the Linux loopback
+  driver.  (Specifically, doing a large number of writes to an
+  NFS-backed image doesn't result in the OOM killer going berserk.)
+
+- Per-disk handler processes enable easier userspace policing of block
+  resources, and process-granularity QoS techniques (disk scheduling
+  and related tools) may be trivially applied to block devices.
+
+- It's very easy to take advantage of userspace facilities such as
+  networking libraries, compression utilities, peer-to-peer
+  file-sharing systems and so on to build more complex block backends.
+
+- Crashes are contained -- incremental development/debugging is very
+  fast.
+
+How it works (in one paragraph):
+
+Working in conjunction with the kernel blktap driver, all disk I/O
+requests from VMs are passed to the userspace daemon (using a shared
+memory interface) through a character device. Each active disk is
+mapped to an individual device node, allowing per-disk processes to
+implement individual block devices where desired.  The userspace
+drivers are implemented using asynchronous (Linux libaio),
+O_DIRECT-based calls to preserve the unbuffered, batched and
+asynchronous request dispatch achieved with the existing blkback
+code.  We provide a simple, asynchronous virtual disk interface that
+makes it quite easy to add new disk implementations.
+
+As of June 2006 the current supported disk formats are:
+
+ - Raw Images (both on partitions and in image files)
+ - File-backed Qcow disks
+ - Standalone sparse Qcow disks
+ - Fast shareable RAM disk between VMs (requires some form of cluster-based 
+   filesystem support e.g. OCFS2 in the guest kernel)
+ - Some VMDK images - your mileage may vary
+
+Raw and QCow images have asynchronous backends and so should perform
+fairly well.  VMDK is based directly on the qemu vmdk driver, which is
+synchronous (a.k.a. slow).
+
+Build and Installation Instructions
+===================================
+
+Make sure to configure the blktap backend driver in your dom0 kernel.  It
+will cooperate fine with the existing backend driver, so you can
+experiment with tap disks without breaking existing VM configs.
+
+To build the tools separately, "make && make install" in 
+tools/blktap2.
+
+
+Using the Tools
+===============
+
+Prepare the image for booting. For qcow files use the qcow utilities
+installed earlier. e.g. qcow-create generates a blank standalone image
+or a file-backed CoW image. img2qcow takes an existing image or
+partition and creates a sparse, standalone qcow-based file.
+
+The userspace disk agent is configured to start automatically via xend
+(alternatively you can start it manually => 'blktapctrl')
+
+Customise the VM config file to use the 'tap' handler, followed by the
+driver type. e.g. for a raw image such as a file or partition:
+
+disk = ['tap:aio:<FILENAME>,sda1,w']
+
+e.g. for a qcow image:
+
+disk = ['tap:qcow:<FILENAME>,sda1,w']
+
+
+Mounting images in Dom0 using the blktap driver
+===============================================
+Tap (and blkback) disks are also mountable in Dom0 without requiring an
+active VM to attach. You will need to build a xenlinux Dom0 kernel that
+includes the blkfront driver (e.g. the default 'make world' or 
+'make kernels' build). Simply use the xm command-line tool to activate
+the backend disks, and blkfront will generate a virtual block device that
+can be accessed in the same way as a loop device or partition:
+
+e.g. for a raw image file <FILENAME> that would normally be mounted using
+the loopback driver (such as 'mount -o loop <FILENAME> /mnt/disk'), do the
+following:
+
+xm block-attach 0 tap:aio:<FILENAME> /dev/xvda1 w 0
+mount /dev/xvda1 /mnt/disk        <--- don't use loop driver
+
+In this way, you can use any of the userspace device-type drivers built
+with the blktap userspace toolkit to open and mount disks such as qcow
+or vmdk images:
+
+xm block-attach 0 tap:qcow:<FILENAME> /dev/xvda1 w 0
+mount /dev/xvda1 /mnt/disk
+
+
+
+ 
diff -r f210a633571c -r 1c627434605e tools/blktap2/daemon/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/daemon/Makefile     Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,55 @@
+XEN_ROOT=../../../
+BLKTAP_ROOT := ..
+include $(XEN_ROOT)/tools/Rules.mk
+
+IBIN          = blktapctrl
+INST_DIR      = $(SBINDIR)
+
+LIBDIR        = lib
+
+LIBS         := -lxenstore
+LIBS         += -Llib
+LIBS         += -lblktap
+LIBS         += -lxenctrl
+
+ifneq ($(USE_SYSTEM_LIBRARIES),y)
+INCLUDES     += -I $(XEN_LIBXC) -I $(XEN_XENSTORE)
+LIBS         += -L $(XEN_LIBXC) -L $(XEN_XENSTORE)
+endif
+
+OBJS         := tapdisk-dispatch-common.o
+OBJS         += tapdisk-channel.o
+
+CFLAGS       += -Werror
+CFLAGS       += -Wno-unused
+CFLAGS       += -fno-strict-aliasing -fPIC
+CFLAGS       += -Ilib -I../include -I../drivers -I../../include $(INCLUDES)
+CFLAGS       += -D_GNU_SOURCE
+CFLAGS       += -g
+
+# Get gcc to generate the dependencies for us.
+CFLAGS       += -Wp,-MD,.$(@F).d
+DEPS          = .*.d
+
+all: libblktap $(IBIN)
+
+blktapctrl: tapdisk-daemon.c $(OBJS)
+       $(CC) $(CFLAGS) -o blktapctrl tapdisk-daemon.c $(LIBS) $(OBJS)
+
+libblktap:
+       @set -e
+       $(MAKE) -C $(LIBDIR) all
+
+install: all
+       $(MAKE) -C $(LIBDIR) install
+       $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+       $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INST_DIR)
+
+clean:
+       $(MAKE) -C $(LIBDIR) clean
+       rm -rf *.o *~ $(IBIN) $(DEPS) xen TAGS
+
+.PHONY: all clean install blktapctrl libblktap
+
+-include $(DEPS)
+
diff -r f210a633571c -r 1c627434605e tools/blktap2/daemon/lib/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/daemon/lib/Makefile Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,69 @@
+XEN_ROOT=../../../../
+BLKTAP_ROOT := ../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+MAJOR    = 3.1
+MINOR    = 0
+SONAME   = libblktap.so.$(MAJOR)
+
+BLKTAP_INSTALL_DIR = /usr/sbin
+
+LIBS     := -lxenstore
+
+ifneq ($(USE_SYSTEM_LIBRARIES),y)
+INCLUDES += -I $(XEN_LIBXC) -I $(XEN_XENSTORE)
+LIBS     += -L$(XEN_XENSTORE)
+endif
+
+SRCS     :=
+SRCS     += xs_api.c
+CFLAGS   += -Werror
+CFLAGS   += -Wno-unused
+CFLAGS   += -fno-strict-aliasing -fPIC
+# get asprintf():
+CFLAGS   += -D _GNU_SOURCE
+CFLAGS   += -g
+CFLAGS   += -I../../include -I../../../include/ $(INCLUDES) 
+
+
+# Get gcc to generate the dependencies for us.
+CFLAGS  += -Wp,-MD,.$(@F).d
+DEPS     = .*.d
+
+OBJS     = $(patsubst %.c,%.o,$(SRCS))
+IBINS   :=
+
+LIB      = libblktap.a libblktap.so.$(MAJOR).$(MINOR)
+
+.PHONY: all
+all: build
+
+.PHONY: build
+build: libblktap.a
+
+.PHONY: libblktap
+libblktap: libblktap.a
+
+install: all
+       $(INSTALL_DIR) -p $(DESTDIR)$(LIBDIR)
+       $(INSTALL_DATA) $(LIB) $(DESTDIR)$(LIBDIR)
+       ln -sf libblktap.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libblktap.so.$(MAJOR)
+       ln -sf libblktap.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libblktap.so
+
+clean:
+       rm -rf *.a *.so* *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS
+
+libblktap.a: $(OBJS) 
+       $(CC) $(CFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,$(SONAME) $(SHLIB_CFLAGS) \
+             -o libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
+       ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR)
+       ln -sf libblktap.so.$(MAJOR) libblktap.so
+       $(AR) rc $@ libblktap.so
+
+.PHONY: TAGS all build clean install libblktap
+
+TAGS:
+       etags -t $(SRCS) *.h
+
+-include $(DEPS)
+
diff -r f210a633571c -r 1c627434605e tools/blktap2/daemon/lib/xs_api.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/daemon/lib/xs_api.c Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,323 @@
+/*
+ * xs_api.c
+ * 
+ * blocktap interface functions to xenstore
+ *
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <xs.h>
+
+#include "xs_api.h"
+#include "blktaplib.h"
+
+#define DOMNAME "Domain-0"
+#define BASE_DEV_VAL 2048
+
+static LIST_HEAD(watches);
+
+/*
+ * Read several nodes below @dir inside a single xenstore transaction.
+ *
+ * The variadic arguments are (name, fmt, result) triples, terminated by a
+ * NULL name.  For each triple the node "<dir>/<name>" is read.  When fmt is
+ * non-NULL the value is parsed with sscanf(fmt) into result and the raw
+ * string is freed; when fmt is NULL, *(char **)result receives the buffer
+ * returned by xs_read() unfreed (presumably for the caller to free).
+ *
+ * Returns 0 on success, otherwise a positive errno value: ENOMEM, ENOENT
+ * (node could not be read), EINVAL (value did not parse), or the errno left
+ * behind by a failing xs_transaction_end().  If no error was recorded and
+ * ending the transaction fails with EAGAIN, the whole gather is retried.
+ *
+ * NOTE(review): strings already handed out through a NULL-fmt triple are
+ * not released when a later node fails or when the transaction is retried.
+ */
+int
+xs_gather(struct xs_handle *xs, const char *dir, ...)
+{
+       va_list ap;
+       const char *name;
+       char *path;
+       int ret = 0;
+       unsigned int len;
+       xs_transaction_t xth;
+
+again:
+       if ((xth = xs_transaction_start(xs)) == XBT_NULL) {
+               DPRINTF("unable to start xs trasanction\n");
+               return ENOMEM;
+       }
+
+       va_start(ap, dir);
+       while ((ret == 0) && (name = va_arg(ap, char *)) != NULL) {
+               char *p;
+               const char *fmt = va_arg(ap, char *);
+               void *result = va_arg(ap, void *);
+
+               if (asprintf(&path, "%s/%s", dir, name) == -1) {
+                       EPRINTF("allocation error in xs_gather!\n");
+                       ret = ENOMEM;
+                       break;
+               }
+
+               p = xs_read(xs, xth, path, &len);
+               free(path);
+
+               if (!p) {
+                       ret = ENOENT;
+                       break;
+               }
+
+               if (fmt) {
+                       /*
+                        * sscanf() returns EOF (negative) for an empty
+                        * value, so anything short of one conversion is a
+                        * parse failure -- not just an exact zero.
+                        */
+                       if (sscanf(p, fmt, result) <= 0)
+                               ret = EINVAL;
+                       free(p);
+               } else
+                       *(char **)result = p;
+       }
+
+       va_end(ap);
+
+       /* ret doubles as the abort flag passed to xs_transaction_end() */
+       if (!xs_transaction_end(xs, xth, ret)) {
+               if (ret == 0 && errno == EAGAIN)
+                       goto again;
+               else
+                       ret = errno;
+       }
+
+       return ret;
+}
+
+/*
+ * Single printf and write: format a value and store it, including its
+ * terminating NUL, at the node "<dir>/<node>".
+ *
+ * Returns whatever xs_write() returns, or 0 when either the value or the
+ * path string could not be allocated.
+ */
+int
+xs_printf(struct xs_handle *h, const char *dir,
+         const char *node, const char *fmt, ...)
+{
+       va_list args;
+       char *value = NULL, *target = NULL;
+       int rc = 0;
+
+       va_start(args, fmt);
+       if (vasprintf(&value, fmt, args) == -1)
+               value = NULL;   /* contents are undefined after a failure */
+       va_end(args);
+
+       if (value != NULL) {
+               if (asprintf(&target, "%s/%s", dir, node) == -1)
+                       target = NULL;
+               else
+                       rc = xs_write(h, XBT_NULL, target, value,
+                                     strlen(value)+1);
+       }
+
+       free(target);
+       free(value);
+
+       return rc;
+}
+
+/*
+ * Test whether @path can be listed as a xenstore directory.
+ *
+ * Returns 1 when xs_directory() succeeds, 0 when it fails or when no
+ * transaction could be started.
+ */
+int
+xs_exists(struct xs_handle *h, const char *path)
+{
+       int found;
+       char **entries;
+       unsigned int count;
+       xs_transaction_t xth = xs_transaction_start(h);
+
+       if (xth == XBT_NULL) {
+               EPRINTF("unable to start xs trasanction\n");
+               return 0;
+       }
+
+       entries = xs_directory(h, xth, path, &count);
+       xs_transaction_end(h, xth, 0);
+
+       /* only the existence matters; the listing itself is discarded */
+       found = (entries != NULL);
+       free(entries);
+
+       return found;
+}
+
+
+
+/**
+ * Look up the domid of the domain whose xenstore name equals DOMNAME
+ * ("Domain-0").  This assumes that the domain name is unique.
+ *
+ * Walks the entries of /local/domain inside one transaction and, for the
+ * first entry whose "name" node matches, reads its "domid" node.
+ *
+ * Returns the string obtained from xs_read() (presumably for the caller
+ * to free), or NULL if nothing matched or an error occurred.
+ */
+char *
+get_dom_domid(struct xs_handle *h)
+{
+       int idx;
+       xs_transaction_t xth;
+       unsigned int count, len;
+       char **entries, *name, *node, *found = NULL;
+
+       xth = xs_transaction_start(h);
+       if (xth == XBT_NULL) {
+               EPRINTF("unable to start xs trasanction\n");
+               return NULL;
+       }
+
+       entries = xs_directory(h, xth, "/local/domain", &count);
+
+       for (idx = 0; entries && (idx < count) && !found; idx++) {
+               if (asprintf(&node, "/local/domain/%s/name",
+                            entries[idx]) == -1)
+                       break;
+
+               name = xs_read(h, xth, node, &len);
+               free(node);
+               if (!name)
+                       continue;
+
+               if (strcmp(name, DOMNAME) != 0) {
+                       free(name);
+                       continue;
+               }
+
+               /* match! the name string is no longer needed */
+               free(name);
+               if (asprintf(&node, "/local/domain/%s/domid",
+                            entries[idx]) == -1)
+                       break;
+
+               found = xs_read(h, xth, node, &len);
+               free(node);
+       }
+
+       xs_transaction_end(h, xth, 0);
+       free(entries);
+       return found;
+}
+
+/*
+ * Map a watch token back to its registered xenbus_watch.
+ *
+ * A token has the form "<watch address>:<nonce>", both in hex.  A little
+ * paranoia: we don't just trust the token -- the decoded address is only
+ * compared against the entries on the watches list and is never
+ * dereferenced itself, and the entry's nonce has to match as well.
+ *
+ * Returns the matching watch, or NULL if the token is malformed or no
+ * entry on the list matches.
+ */
+static struct xenbus_watch *find_watch(const char *token)
+{
+       /* %lX stores through an unsigned long *, so both targets are unsigned */
+       unsigned long addr, nonce;
+       struct xenbus_watch *i, *cmp;
+
+       if (sscanf(token, "%lX:%lX", &addr, &nonce) != 2) {
+               EPRINTF("invalid watch token %s\n", token);
+               return NULL;
+       }
+
+       cmp = (struct xenbus_watch *)addr;
+       list_for_each_entry(i, &watches, list)
+               if (i == cmp && (unsigned long)i->nonce == nonce)
+                       return i;
+
+       return NULL;
+}
+
+/*
+ * Register callback to watch this node.
+ *
+ * Stamps the watch with a nonce, asks xenstore to watch watch->node using
+ * the token "<watch address>:<nonce>", and adds the watch to the watches
+ * list so that find_watch() can resolve it later.
+ *
+ * Unlike xs_watch(), this returns 0 on success and -EINVAL on failure
+ * (the token collides with a registered watch, or xs_watch() failed).
+ */
+int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
+{
+       /* Pointer in ascii is the token. */
+       char token[(sizeof(watch) + sizeof(long)) * 2 + 2];
+
+       /* 1-second granularity should suffice here */
+       watch->nonce = time(NULL);
+
+       /* %lX takes unsigned long; snprintf keeps the write inside token[] */
+       snprintf(token, sizeof(token), "%lX:%lX",
+                (unsigned long)watch, (unsigned long)watch->nonce);
+       if (find_watch(token)) {
+               EPRINTF("watch collision!\n");
+               return -EINVAL;
+       }
+
+       if (!xs_watch(h, watch->node, token)) {
+               EPRINTF("unable to set watch!\n");
+               return -EINVAL;
+       }
+
+       list_add(&watch->list, &watches);
+
+       return 0;
+}
+
+/*
+ * Remove a watch from the watches list and ask xenstore to stop watching
+ * watch->node.
+ *
+ * Returns -EINVAL if the watch's "<address>:<nonce>" token does not match
+ * any entry on the list, 0 otherwise.  A failing xs_unwatch() is only
+ * logged; the watch is still unlinked from the list.
+ */
+int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
+{
+       char token[(sizeof(watch) + sizeof(long)) * 2 + 2];
+
+       /* %lX takes unsigned long; snprintf keeps the write inside token[] */
+       snprintf(token, sizeof(token), "%lX:%lX",
+                (unsigned long)watch, (unsigned long)watch->nonce);
+       if (!find_watch(token)) {
+               EPRINTF("no such watch!\n");
+               return -EINVAL;
+       }
+
+       if (!xs_unwatch(h, watch->node, token))
+               EPRINTF("XENBUS Failed to release watch %s\n", watch->node);
+
+       list_del(&watch->list);
+
+       return 0;
+}
+
+/*
+ * Re-register callbacks to all watches: issue xs_watch() on handle h for
+ * every entry on the watches list, reusing each entry's existing nonce so
+ * the token stays the same.
+ *
+ * A watch that cannot be set is logged and the loop carries on with the
+ * remaining entries.
+ */
+void reregister_xenbus_watches(struct xs_handle *h)
+{
+       struct xenbus_watch *watch;
+       char token[(sizeof(watch) + sizeof(long)) * 2 + 2];
+
+       list_for_each_entry(watch, &watches, list) {
+               /* %lX takes unsigned long; snprintf bounds the write */
+               snprintf(token, sizeof(token), "%lX:%lX",
+                        (unsigned long)watch, (unsigned long)watch->nonce);
+               if (!xs_watch(h, watch->node, token))
+                       EPRINTF("unable to re-register watch on %s\n",
+                               watch->node);
+       }
+}
+/*
+ * Read one watch event from xenstore and dispatch it
+ * (based on watch_thread()).
+ *
+ * Returns -EAGAIN when xs_read_watch() yields nothing.  Otherwise returns 1
+ * after the event's token has been looked up and, if it resolves to a
+ * registered watch, that watch's callback has been invoked with the path.
+ */
+int xs_fire_next_watch(struct xs_handle *h)
+{
+       unsigned int count;
+       char **event, *path, *tok;
+       struct xenbus_watch *target;
+
+       event = xs_read_watch(h, &count);
+       if (!event)
+               return -EAGAIN; /* in O_NONBLOCK, read_watch returns 0... */
+
+       path = event[XS_WATCH_PATH];
+       tok  = event[XS_WATCH_TOKEN];
+       DPRINTF("got watch %s on %s\n", tok, path);
+
+       /* events whose token matches no registered watch are dropped */
+       target = find_watch(tok);
+       if (target != NULL)
+               target->callback(h, target, path);
+
+       DPRINTF("handled watch %s on %s\n", tok, path);
+
+       free(event);
+       return 1;
+}
diff -r f210a633571c -r 1c627434605e tools/blktap2/daemon/lib/xs_api.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/daemon/lib/xs_api.h Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,62 @@
+/*
+ * xs_api.h
+ *
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _XS_API_H_
+#define _XS_API_H_
+
+#include <xs.h>
+
+#include "list.h"
+
+struct xenbus_watch  /* one xenstore watch tracked by the xs_api.c helpers */
+{
+        struct list_head  list;   /* link used by xs_api.c's "watches" list */
+        char             *node;   /* xenstore path handed to xs_watch()/xs_unwatch() */
+       void             *data;   /* not touched by xs_api.c; presumably owner-private context */
+       long              nonce;  /* set from time(NULL) at registration; part of the watch token */
+        void (*callback) (struct xs_handle *h, 
+                         struct xenbus_watch *, 
+                         const  char *node); /* called by xs_fire_next_watch() with the event path */
+};
+
+int xs_gather(struct xs_handle *xs, const char *dir, ...);
+int xs_printf(struct xs_handle *h, const char *dir, const char *node, 
+             const char *fmt, ...) __attribute__((format(printf, 4, 5)));
+int xs_exists(struct xs_handle *h, const char *path);
+char *get_dom_domid(struct xs_handle *h);
+int convert_dev_name_to_num(char *name);
+
+int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch);
+int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch);
+void reregister_xenbus_watches(struct xs_handle *h);
+int xs_fire_next_watch(struct xs_handle *h);
+
+#endif
diff -r f210a633571c -r 1c627434605e tools/blktap2/daemon/tapdisk-channel.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/daemon/tapdisk-channel.c    Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,1367 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdarg.h>
+#include <sys/wait.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+
+#include <xs.h>
+#include "disktypes.h"
+#include "tapdisk-dispatch.h"
+
+/*
+ * Channel states.  After a request has been written successfully the
+ * channel is moved to the WAIT_* state matching the request type; a
+ * response of the matching type puts it back to IDLE.  CLOSED is entered
+ * when the close response arrives.
+ */
+#define TAPDISK_CHANNEL_IDLE          1
+#define TAPDISK_CHANNEL_WAIT_PID      2
+#define TAPDISK_CHANNEL_WAIT_OPEN     3
+#define TAPDISK_CHANNEL_WAIT_PAUSE    4
+#define TAPDISK_CHANNEL_WAIT_RESUME   5
+#define TAPDISK_CHANNEL_WAIT_CLOSE    6
+#define TAPDISK_CHANNEL_CLOSED        7
+
+/* Forward declarations for functions that are used before their definition. */
+static void tapdisk_channel_error(tapdisk_channel_t *,
+                                 const char *fmt, ...)
+  __attribute__((format(printf, 2, 3)));
+static void tapdisk_channel_fatal(tapdisk_channel_t *,
+                                 const char *fmt, ...)
+  __attribute__((format(printf, 2, 3)));
+static int tapdisk_channel_parse_params(tapdisk_channel_t *);
+static void tapdisk_channel_pause_event(struct xs_handle *,
+                                       struct xenbus_watch *,
+                                       const char *);
+
+/*
+ * Re-read the channel's uuid node from xenstore and compare its value,
+ * parsed as a decimal number, with the cookie stored in the channel.
+ *
+ * Returns 0 when they match, -errno when the node cannot be read and
+ * -EINVAL when the value differs from the cookie.
+ */
+static int
+tapdisk_channel_check_uuid(tapdisk_channel_t *channel)
+{
+       char *value;
+       uint32_t current;
+
+       value = xs_read(channel->xsh, XBT_NULL, channel->uuid_str, NULL);
+       if (value == NULL)
+               return -errno;
+
+       current = strtoul(value, NULL, 10);
+       free(value);
+
+       return (current != channel->cookie) ? -EINVAL : 0;
+}
+
+/*
+ * Sanity-check a fired watch before acting on it.
+ *
+ * Returns -EINVAL when strsep_len() rejects the path, the error from
+ * tapdisk_channel_check_uuid() when the uuid check fails, -ENOENT when
+ * the watched path does not exist in xenstore, and 0 otherwise.
+ */
+static inline int
+tapdisk_channel_validate_watch(tapdisk_channel_t *channel, const char *path)
+{
+       int rc;
+       int len = strsep_len(path, '/', 7);
+
+       if (len < 0)
+               return -EINVAL;
+
+       rc = tapdisk_channel_check_uuid(channel);
+       if (rc != 0)
+               return rc;
+
+       return xs_exists(channel->xsh, path) ? 0 : -ENOENT;
+}
+
+/*
+ * Check that an incoming response is the one the channel is currently
+ * waiting for.
+ *
+ * Returns -EINVAL when a *_RSP message arrives while the channel is in a
+ * different wait state.  Otherwise the channel is put back to IDLE and 0
+ * is returned.  Runtime errors are accepted at any time and leave the
+ * state untouched; message types not listed here are not checked.
+ */
+static inline int
+tapdisk_channel_validate_message(tapdisk_channel_t *channel,
+                                tapdisk_message_t *message)
+{
+       int expected;
+
+       switch (message->type) {
+       case TAPDISK_MESSAGE_RUNTIME_ERROR:
+               /* may arrive at any time; must not disturb the state machine */
+               return 0;
+
+       case TAPDISK_MESSAGE_PID_RSP:
+               expected = TAPDISK_CHANNEL_WAIT_PID;
+               break;
+
+       case TAPDISK_MESSAGE_OPEN_RSP:
+               expected = TAPDISK_CHANNEL_WAIT_OPEN;
+               break;
+
+       case TAPDISK_MESSAGE_PAUSE_RSP:
+               expected = TAPDISK_CHANNEL_WAIT_PAUSE;
+               break;
+
+       case TAPDISK_MESSAGE_RESUME_RSP:
+               expected = TAPDISK_CHANNEL_WAIT_RESUME;
+               break;
+
+       case TAPDISK_MESSAGE_CLOSE_RSP:
+               expected = TAPDISK_CHANNEL_WAIT_CLOSE;
+               break;
+
+       default:
+               /* no state requirement for any other message type */
+               expected = channel->state;
+               break;
+       }
+
+       if (channel->state != expected)
+               return -EINVAL;
+
+       channel->state = TAPDISK_CHANNEL_IDLE;
+       return 0;
+}
+
+/*
+ * Write one complete tapdisk_message_t to the channel's write fd.
+ *
+ * @timeout is the select() timeout in seconds.  The timeval is not
+ * re-initialized between iterations.
+ *
+ * Returns 0 once the whole message has been written; the channel state is
+ * then moved to the WAIT_* state matching the request type.  Returns -EIO
+ * when select()/write() fail or time out before the message is complete,
+ * leaving the channel state untouched.
+ */
+static int
+tapdisk_channel_send_message(tapdisk_channel_t *channel,
+                            tapdisk_message_t *message, int timeout)
+{
+       fd_set writefds;
+       struct timeval tv;
+       int ret, len, offset;
+       const char *buf;
+
+       tv.tv_sec  = timeout;
+       tv.tv_usec = 0;
+       offset     = 0;
+       len        = sizeof(tapdisk_message_t);
+       /* byte view of the message: offsets below are counted in bytes */
+       buf        = (const char *)message;
+
+       DPRINTF("%s: sending '%s' message to %d:%d\n",
+               channel->path, tapdisk_message_name(message->type),
+               channel->channel_id, channel->cookie);
+
+       if (channel->state != TAPDISK_CHANNEL_IDLE &&
+           message->type  != TAPDISK_MESSAGE_CLOSE)
+               EPRINTF("%s: writing message to non-idle channel (%d)\n",
+                       channel->path, channel->state);
+
+       while (offset < len) {
+               FD_ZERO(&writefds);
+               FD_SET(channel->write_fd, &writefds);
+
+               /* we don't bother reinitializing tv. at worst, it will wait a
+                * bit more time than expected. */
+
+               ret = select(channel->write_fd + 1,
+                            NULL, &writefds, NULL, &tv);
+               if (ret == -1)
+                       break;
+               else if (FD_ISSET(channel->write_fd, &writefds)) {
+                       /*
+                        * Resume a partial write at the right byte.  Adding
+                        * offset to the tapdisk_message_t pointer itself would
+                        * advance by whole messages and read past the buffer.
+                        */
+                       ret = write(channel->write_fd,
+                                   buf + offset, len - offset);
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break;
+       }
+
+       if (offset != len) {
+               EPRINTF("%s: error writing '%s' message to %d:%d\n",
+                       channel->path, tapdisk_message_name(message->type),
+                       channel->channel_id, channel->cookie);
+               return -EIO;
+       }
+
+       switch (message->type) {
+       case TAPDISK_MESSAGE_PID:
+               channel->state = TAPDISK_CHANNEL_WAIT_PID;
+               break;
+
+       case TAPDISK_MESSAGE_OPEN:
+               channel->state = TAPDISK_CHANNEL_WAIT_OPEN;
+               break;
+
+       case TAPDISK_MESSAGE_PAUSE:
+               channel->state = TAPDISK_CHANNEL_WAIT_PAUSE;
+               break;
+
+       case TAPDISK_MESSAGE_RESUME:
+               channel->state = TAPDISK_CHANNEL_WAIT_RESUME;
+               break;
+
+       case TAPDISK_MESSAGE_CLOSE:
+               channel->state = TAPDISK_CHANNEL_WAIT_CLOSE;
+               break;
+
+       default:
+               EPRINTF("%s: unrecognized message type %d\n",
+                       channel->path, message->type);
+       }
+
+       return 0;
+}
+
+/*
+ * Format an error message, log it, and publish it in xenstore under
+ * "<channel path>/tapdisk-error".  When the message cannot be allocated
+ * the fixed text "tapdisk error" is used instead.
+ */
+static void
+__tapdisk_channel_error(tapdisk_channel_t *channel,
+                       const char *fmt, va_list ap)
+{
+       char *text = NULL;
+       char *node = NULL;
+       char *report;
+
+       if (vasprintf(&text, fmt, ap) == -1) {
+               EPRINTF("failed to allocate error message\n");
+               text = NULL;
+       }
+
+       report = text ? text : "tapdisk error";
+
+       EPRINTF("%s: %s\n", channel->path, report);
+
+       if (asprintf(&node, "%s/tapdisk-error", channel->path) == -1) {
+               EPRINTF("%s: failed to write %s\n", __func__, report);
+               node = NULL;
+       } else {
+               xs_write(channel->xsh, XBT_NULL, node, report, strlen(report));
+       }
+
+       free(node);
+       free(text);
+}
+
+/*
+ * printf-style front end for __tapdisk_channel_error(): report an error
+ * on the channel without closing it.
+ */
+static void
+tapdisk_channel_error(tapdisk_channel_t *channel, const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       __tapdisk_channel_error(channel, fmt, args);
+       va_end(args);
+}
+
+/*
+ * Report an error exactly like tapdisk_channel_error(), then close the
+ * channel with tapdisk_channel_close().
+ */
+static void
+tapdisk_channel_fatal(tapdisk_channel_t *channel, const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       __tapdisk_channel_error(channel, fmt, args);
+       va_end(args);
+
+       tapdisk_channel_close(channel);
+}
+
+/*
+ * Set up the backend block device for this channel:
+ *   1. issue BLKTAP_IOCTL_BACKDEV_SETUP for the channel's minor,
+ *   2. read "<path>/backdev-node" ("major:minor") from xenstore,
+ *   3. create the device node via make_blktap_device(),
+ *   4. publish the node's name in "<path>/backdev-path".
+ *
+ * Returns 0 on success or a negative errno value; every failure is
+ * logged.
+ */
+static int
+tapdisk_channel_connect_backdev(tapdisk_channel_t *channel)
+{
+       int err, major, minor;
+       char *node = NULL, *key = NULL, *devname = NULL;
+
+       if (ioctl(channel->blktap_fd,
+                 BLKTAP_IOCTL_BACKDEV_SETUP, channel->minor)) {
+               err = -errno;
+               goto fail;
+       }
+
+       if (asprintf(&key, "%s/backdev-node", channel->path) == -1) {
+               key = NULL;
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       node = xs_read(channel->xsh, XBT_NULL, key, NULL);
+       if (node == NULL) {
+               err = -errno;
+               goto fail;
+       }
+
+       if (sscanf(node, "%d:%d", &major, &minor) != 2) {
+               err = -EINVAL;
+               goto fail;
+       }
+
+       if (asprintf(&devname, "%s/%s%d",
+                    BLKTAP_DEV_DIR, BACKDEV_NAME, minor) == -1) {
+               devname = NULL;
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       err = make_blktap_device(devname, major, minor, S_IFBLK | 0600);
+       if (err)
+               goto fail;
+
+       /* reuse key for the second xenstore path */
+       free(key);
+       if (asprintf(&key, "%s/backdev-path", channel->path) == -1) {
+               key = NULL;
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       if (!xs_write(channel->xsh, XBT_NULL, key, devname, strlen(devname))) {
+               err = -errno;
+               goto fail;
+       }
+
+       err = 0;
+       goto out;
+
+ fail:
+       EPRINTF("backdev setup failed [%d]\n", err);
+ out:
+       free(devname);
+       free(key);
+       free(node);
+       return err;
+}
+
+/*
+ * Publish the image geometry ("sectors", "sector-size", "info") under the
+ * channel's xenstore path and connect the backend device.  On success the
+ * channel is marked connected and 0 is returned.
+ *
+ * If a xenstore write fails, -errno is returned immediately.  If the
+ * backdev connection fails, the three keys are removed again (stopping at
+ * the first removal that fails) and the backdev error is returned.
+ */
+static int
+tapdisk_channel_complete_connection(tapdisk_channel_t *channel)
+{
+       int err;
+       char *path;
+
+       /* errno is captured before logging, which may overwrite it */
+       if (!xs_printf(channel->xsh, channel->path,
+                      "sectors", "%llu", channel->image.size)) {
+               err = -errno;
+               EPRINTF("ERROR: Failed writing sectors\n");
+               return err;
+       }
+
+       if (!xs_printf(channel->xsh, channel->path,
+                      "sector-size", "%lu", channel->image.secsize)) {
+               err = -errno;
+               EPRINTF("ERROR: Failed writing sector-size\n");
+               return err;
+       }
+
+       if (!xs_printf(channel->xsh, channel->path,
+                      "info", "%u", channel->image.info)) {
+               err = -errno;
+               EPRINTF("ERROR: Failed writing info\n");
+               return err;
+       }
+
+       err = tapdisk_channel_connect_backdev(channel);
+       if (err)
+               goto clean;
+
+       channel->connected = 1;
+       return 0;
+
+ clean:
+       /* undo the xenstore writes in reverse order; err keeps the cause */
+       if (asprintf(&path, "%s/info", channel->path) == -1)
+               return err;
+
+       if (!xs_rm(channel->xsh, XBT_NULL, path))
+               goto clean_out;
+
+       free(path);
+       if (asprintf(&path, "%s/sector-size", channel->path) == -1)
+               return err;
+
+       if (!xs_rm(channel->xsh, XBT_NULL, path))
+               goto clean_out;
+
+       free(path);
+       if (asprintf(&path, "%s/sectors", channel->path) == -1)
+               return err;
+
+       xs_rm(channel->xsh, XBT_NULL, path);
+
+ clean_out:
+       free(path);
+       return err;
+}
+
+/*
+ * Build a TAPDISK_MESSAGE_OPEN request from the channel's parameters and
+ * send it with a 2 second timeout.  Returns the result of
+ * tapdisk_channel_send_message().
+ */
+static int
+tapdisk_channel_send_open_request(tapdisk_channel_t *channel)
+{
+       tapdisk_message_t request;
+       int path_len = strlen(channel->vdi_path);
+
+       /* zero everything first: this also NUL-terminates the copied path */
+       memset(&request, 0, sizeof(request));
+
+       request.type              = TAPDISK_MESSAGE_OPEN;
+       request.cookie            = channel->cookie;
+       request.drivertype        = channel->drivertype;
+       request.u.params.storage  = channel->storage;
+       request.u.params.devnum   = channel->minor;
+       request.u.params.domid    = channel->domid;
+       request.u.params.path_len = path_len;
+       memcpy(request.u.params.path, channel->vdi_path, path_len);
+
+       if (channel->mode == 'r')
+               request.u.params.flags |= TAPDISK_MESSAGE_FLAG_RDONLY;
+       if (channel->shared)
+               request.u.params.flags |= TAPDISK_MESSAGE_FLAG_SHARED;
+
+       /* TODO: clean this up */
+       if (xs_exists(channel->xsh, "/local/domain/0/tapdisk/add-cache"))
+               request.u.params.flags |= TAPDISK_MESSAGE_FLAG_ADD_CACHE;
+       if (xs_exists(channel->xsh, "/local/domain/0/tapdisk/log-dirty"))
+               request.u.params.flags |= TAPDISK_MESSAGE_FLAG_LOG_DIRTY;
+
+       return tapdisk_channel_send_message(channel, &request, 2);
+}
+
+/*
+ * Handle the response to an open request: record the image geometry,
+ * complete the connection and, if a pause request arrived in the
+ * meantime, replay it now.
+ *
+ * Returns 0 on success.  If the connection cannot be completed the
+ * channel is closed via tapdisk_channel_fatal() and the error is
+ * returned.
+ */
+static int
+tapdisk_channel_receive_open_response(tapdisk_channel_t *channel,
+                                     tapdisk_message_t *message)
+{
+       int rc;
+
+       channel->image.size    = message->u.image.sectors;
+       channel->image.secsize = message->u.image.sector_size;
+       channel->image.info    = message->u.image.info;
+
+       rc = tapdisk_channel_complete_connection(channel);
+       if (rc) {
+               tapdisk_channel_fatal(channel,
+                                     "failure completing connection: %d", rc);
+               return rc;
+       }
+
+       /* nothing deferred: done */
+       if (!channel->pause_needed)
+               return 0;
+
+       /* a pause request was received before the connection completed */
+       DPRINTF("%s: deferred pause request\n", channel->path);
+       tapdisk_channel_pause_event(channel->xsh,
+                                   &channel->pause_watch,
+                                   channel->pause_str);
+       channel->pause_needed = 0;
+
+       return 0;
+}
+
+/*
+ * Send a TAPDISK_MESSAGE_CLOSE request with a 2 second timeout.
+ * Returns the result of tapdisk_channel_send_message().
+ */
+static int
+tapdisk_channel_send_shutdown_request(tapdisk_channel_t *channel)
+{
+       tapdisk_message_t request;
+
+       memset(&request, 0, sizeof(request));
+
+       request.type       = TAPDISK_MESSAGE_CLOSE;
+       request.cookie     = channel->cookie;
+       request.drivertype = channel->drivertype;
+
+       return tapdisk_channel_send_message(channel, &request, 2);
+}
+
+/*
+ * Handle the response to a close request: mark the channel as no longer
+ * open, move it to the CLOSED state and close it.  The message contents
+ * are not used.  Always returns 0.
+ */
+static int
+tapdisk_channel_receive_shutdown_response(tapdisk_channel_t *channel,
+                                         tapdisk_message_t *message)
+{
+       (void)message;
+
+       channel->state = TAPDISK_CHANNEL_CLOSED;
+       channel->open  = 0;
+
+       tapdisk_channel_close(channel);
+
+       return 0;
+}
+
+/*
+ * Report a runtime error sent by tapdisk.  The text carried in the
+ * message is forwarded to tapdisk_channel_error(); the channel stays
+ * open.  Always returns 0.
+ */
+static int
+tapdisk_channel_receive_runtime_error(tapdisk_channel_t *channel,
+                                     tapdisk_message_t *message)
+{
+       const char *text = message->u.string.text;
+
+       tapdisk_channel_error(channel, "runtime error: %s", text);
+
+       return 0;
+}
+
+/*
+ * Send a TAPDISK_MESSAGE_PID request with a 2 second timeout.  On
+ * success the channel is flagged as open.  Returns the result of
+ * tapdisk_channel_send_message().
+ */
+static int
+tapdisk_channel_send_pid_request(tapdisk_channel_t *channel)
+{
+       int rc;
+       tapdisk_message_t request;
+
+       memset(&request, 0, sizeof(request));
+
+       request.type       = TAPDISK_MESSAGE_PID;
+       request.cookie     = channel->cookie;
+       request.drivertype = channel->drivertype;
+
+       rc = tapdisk_channel_send_message(channel, &request, 2);
+       if (rc == 0)
+               channel->open = 1;
+
+       return rc;
+}
+
+/*
+ * Handle the response to a pid request: remember the tapdisk pid, set
+ * its scheduling priority to PRIO_SPECIAL_IO and send the open request.
+ *
+ * Returns 0 on success.  On failure the channel is closed via
+ * tapdisk_channel_fatal() and a negative error code is returned.
+ */
+static int
+tapdisk_channel_receive_pid_response(tapdisk_channel_t *channel,
+                                    tapdisk_message_t *message)
+{
+       int err;
+
+       channel->tapdisk_pid = message->u.tapdisk_pid;
+
+       DPRINTF("%s: tapdisk pid: %d\n", channel->path, channel->tapdisk_pid);
+
+       if (setpriority(PRIO_PROCESS, channel->tapdisk_pid, PRIO_SPECIAL_IO)) {
+               /*
+                * setpriority() only returns -1 on failure; the reason is
+                * in errno, so report that instead of the bare -1.
+                */
+               err = -errno;
+               tapdisk_channel_fatal(channel,
+                                     "setting tapdisk priority: %d", err);
+               return err;
+       }
+
+       err = tapdisk_channel_send_open_request(channel);
+       if (err) {
+               tapdisk_channel_fatal(channel,
+                                     "sending open request: %d", err);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Send a TAPDISK_MESSAGE_PAUSE request with a 2 second timeout.
+ * Returns the result of tapdisk_channel_send_message().
+ */
+static int
+tapdisk_channel_send_pause_request(tapdisk_channel_t *channel)
+{
+       tapdisk_message_t request;
+
+       DPRINTF("pausing %s\n", channel->path);
+
+       memset(&request, 0, sizeof(request));
+
+       request.type       = TAPDISK_MESSAGE_PAUSE;
+       request.cookie     = channel->cookie;
+       request.drivertype = channel->drivertype;
+
+       return tapdisk_channel_send_message(channel, &request, 2);
+}
+
+/*
+ * Handle the response to a pause request by creating the (empty)
+ * pause-done node in xenstore.
+ *
+ * Returns 0 on success.  If the write fails the channel is closed via
+ * tapdisk_channel_fatal() and -errno is returned.
+ */
+static int
+tapdisk_channel_receive_pause_response(tapdisk_channel_t *channel,
+                                      tapdisk_message_t *message)
+{
+       int err;
+
+       if (!xs_write(channel->xsh, XBT_NULL,
+                     channel->pause_done_str, "", strlen(""))) {
+               err = -errno;
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       /*
+        * No trailing newline here: the error path appends one when
+        * logging, and the text is also stored verbatim in xenstore.
+        */
+       tapdisk_channel_fatal(channel,
+                             "failure receiving pause response: %d", err);
+       return err;
+}
+
+/*
+ * Send a TAPDISK_MESSAGE_RESUME request carrying the channel's current
+ * VDI path, with a 2 second timeout.  Returns the result of
+ * tapdisk_channel_send_message().
+ */
+static int
+tapdisk_channel_send_resume_request(tapdisk_channel_t *channel)
+{
+       tapdisk_message_t request;
+       int path_len = strlen(channel->vdi_path);
+
+       /* zero everything first: this also NUL-terminates the copied path */
+       memset(&request, 0, sizeof(request));
+
+       DPRINTF("resuming %s\n", channel->path);
+
+       request.type              = TAPDISK_MESSAGE_RESUME;
+       request.cookie            = channel->cookie;
+       request.drivertype        = channel->drivertype;
+       request.u.params.path_len = path_len;
+       memcpy(request.u.params.path, channel->vdi_path, path_len);
+
+       return tapdisk_channel_send_message(channel, &request, 2);
+}
+
+/*
+ * Handle the response to a resume request by removing the pause-done
+ * node from xenstore.
+ *
+ * Returns 0 on success.  If the removal fails the channel is closed via
+ * tapdisk_channel_fatal() and -errno is returned.
+ */
+static int
+tapdisk_channel_receive_resume_response(tapdisk_channel_t *channel,
+                                       tapdisk_message_t *message)
+{
+       int err;
+
+       if (!xs_rm(channel->xsh, XBT_NULL, channel->pause_done_str)) {
+               err = -errno;
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       tapdisk_channel_fatal(channel,
+                             "failure receiving resume response: %d", err);
+       return err;
+}
+
+/*
+ * Watch callback for the channel's shutdown node.
+ *
+ * If the channel's xenstore directory is gone the channel is closed
+ * right away.  Otherwise the watch is validated and, if it is valid, a
+ * shutdown request is sent to tapdisk.  An -EINVAL validation result is
+ * fatal; any other validation error is silently ignored.
+ */
+static void
+tapdisk_channel_shutdown_event(struct xs_handle *xsh,
+                              struct xenbus_watch *watch, const char *path)
+{
+       tapdisk_channel_t *channel = watch->data;
+       int rc;
+
+       DPRINTF("%s: got watch on %s\n", channel->path, path);
+
+       if (!xs_exists(channel->xsh, channel->path)) {
+               tapdisk_channel_close(channel);
+               return;
+       }
+
+       rc = tapdisk_channel_validate_watch(channel, path);
+       if (rc == -EINVAL)
+               tapdisk_channel_fatal(channel, "bad shutdown watch");
+       if (rc)
+               return;
+
+       tapdisk_channel_send_shutdown_request(channel);
+}
+
+/*
+ * Watch callback for the channel's pause node.
+ *
+ * If the channel's xenstore directory is gone the channel is closed.
+ * Otherwise:
+ *   - if the pause node exists, a pause request is sent, unless a pause
+ *     is already pending or done (ignored) or tapdisk is not connected
+ *     yet (deferred via channel->pause_needed);
+ *   - else, if the pause-done node exists, "params" is re-read from
+ *     xenstore, re-parsed, and a resume request is sent.
+ *
+ * Any failure on either path, including a failed pause request, is
+ * reported through tapdisk_channel_error().
+ */
+static void
+tapdisk_channel_pause_event(struct xs_handle *xsh,
+                           struct xenbus_watch *watch, const char *path)
+{
+       int err, paused;
+       tapdisk_channel_t *channel;
+
+       channel = watch->data;
+
+       DPRINTF("%s: got watch on %s\n", channel->path, path);
+
+       if (!xs_exists(channel->xsh, channel->path)) {
+               tapdisk_channel_close(channel);
+               return;
+       }
+
+       /* NB: The VBD is essentially considered ready since the
+        * backend hotplug event occurred, which is just after
+        * start-tapdisk, not after watch registration. We start
+        * testing xenstore keys with the very first shot, but defer
+        * until after connection completion. */
+
+       err = tapdisk_channel_validate_watch(channel, path);
+       if (err) {
+               if (err == -EINVAL)
+                       tapdisk_channel_fatal(channel, "bad pause watch");
+
+               /* a missing node (-ENOENT) is fine; anything else is not */
+               if (err != -ENOENT)
+                       return;
+
+               err = 0;
+       }
+
+       paused  = xs_exists(xsh, channel->pause_done_str);
+
+       if (xs_exists(xsh, channel->pause_str)) {
+               /*
+                * Duplicate requests are a protocol violation, but
+                * impossible to identify if watch registration and an
+                * actual pause request may fire separately in close
+                * succession. Warn, but do not signal an error.
+                */
+               int pausing = channel->state == TAPDISK_CHANNEL_WAIT_PAUSE;
+               if (pausing || paused) {
+                       DPRINTF("Ignoring pause event for %s vbd %s\n",
+                               pausing ? "pausing" : "paused", channel->path);
+                       goto out;
+               }
+
+               /* defer if tapdisk is not ready yet */
+               if (!channel->connected) {
+                       DPRINTF("%s: deferring pause request\n", path);
+                       channel->pause_needed = 1;
+                       goto out;
+               }
+
+               /* err is kept so that a failed send is reported below */
+               err = tapdisk_channel_send_pause_request(channel);
+
+       } else if (xs_exists(xsh, channel->pause_done_str)) {
+               free(channel->params);
+               channel->params   = NULL;
+               channel->vdi_path = NULL;
+
+               err = xs_gather(channel->xsh, channel->path,
+                               "params", NULL, &channel->params, NULL);
+               if (err) {
+                       EPRINTF("failure re-reading params: %d\n", err);
+                       channel->params = NULL;
+                       goto out;
+               }
+
+               err = tapdisk_channel_parse_params(channel);
+               if (err)
+                       goto out;
+
+               err = tapdisk_channel_send_resume_request(channel);
+       }
+
+out:
+       if (err)
+               tapdisk_channel_error(channel, "pause event failed: %d", err);
+}
+
+/*
+ * Create a fresh control FIFO at @devname (creating BLKTAP_CTRL_DIR
+ * first if necessary) and open it read/write, non-blocking.
+ *
+ * Returns the open file descriptor on success or a negative errno value
+ * on failure.
+ */
+static int
+tapdisk_channel_open_control_socket(char *devname)
+{
+       int err, fd;
+
+       /* errno is captured before each log call, which may overwrite it */
+       err = mkdir(BLKTAP_CTRL_DIR, 0755);
+       if (err == -1 && errno != EEXIST) {
+               err = -errno;
+               EPRINTF("Failure creating %s directory: %d\n",
+                       BLKTAP_CTRL_DIR, -err);
+               return err;
+       }
+
+       err = mkfifo(devname, S_IRWXU | S_IRWXG | S_IRWXO);
+       if (err) {
+               if (errno == EEXIST) {
+                       /*
+                        * Remove fifo since it may have data from
+                        * its previous use --- earlier invocation
+                        * of tapdisk may not have read all messages.
+                        */
+                       err = unlink(devname);
+                       if (err) {
+                               err = -errno;
+                               EPRINTF("ERROR: unlink(%s) failed (%d)\n",
+                                       devname, -err);
+                               return err;
+                       }
+
+                       err = mkfifo(devname, S_IRWXU | S_IRWXG | S_IRWXO);
+               }
+
+               if (err) {
+                       err = -errno;
+                       EPRINTF("ERROR: pipe failed (%d)\n", -err);
+                       return err;
+               }
+       }
+
+       fd = open(devname, O_RDWR | O_NONBLOCK);
+       if (fd == -1) {
+               err = -errno;
+               EPRINTF("Failed to open %s\n", devname);
+               return err;
+       }
+
+       return fd;
+}
+
+/*
+ * Ask the blktap control device for a new interface (minor) for this
+ * channel's domid/busid pair, look up the major number, and create the
+ * matching character device node.
+ *
+ * On success channel->major and channel->minor are filled in and 0 is
+ * returned.  Returns -EINVAL when the driver hands back an unusable
+ * minor or major, -ENOMEM when the node name cannot be allocated, or the
+ * error from make_blktap_device().
+ */
+static int
+tapdisk_channel_get_device_number(tapdisk_channel_t *channel)
+{
+       domid_translate_t tr;
+       int major, minor, rc;
+       char *node;
+
+       tr.busid = channel->busid;
+       tr.domid = channel->domid;
+
+       minor = ioctl(channel->blktap_fd, BLKTAP_IOCTL_NEWINTF, tr);
+       if (minor <= 0 || minor > MAX_TAP_DEV) {
+               EPRINTF("invalid dev id: %d\n", minor);
+               return -EINVAL;
+       }
+
+       major = ioctl(channel->blktap_fd, BLKTAP_IOCTL_MAJOR, minor);
+       if (major < 0) {
+               EPRINTF("invalid major id: %d\n", major);
+               return -EINVAL;
+       }
+
+       if (asprintf(&node, "%s/%s%d",
+                    BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, minor) == -1) {
+               EPRINTF("get_new_dev: malloc failed\n");
+               return -ENOMEM;
+       }
+
+       rc = make_blktap_device(node, major, minor, S_IFCHR | 0600);
+       free(node);
+       if (rc)
+               return rc;
+
+       DPRINTF("Received device id %d and major %d, "
+               "sent domid %d and be_id %d\n",
+               minor, major, tr.domid, tr.busid);
+
+       channel->minor = minor;
+       channel->major = major;
+
+       return 0;
+}
+
+/*
+ * Fork and exec "tapdisk <write_dev> <read_dev>", then wait for the
+ * forked child to go away.
+ *
+ * The child closes every descriptor except stdin/stdout/stderr before
+ * exec'ing and exits with status 1 if the exec fails.  The child's exit
+ * status is not examined.
+ *
+ * Returns -errno if fork() fails, 0 otherwise.
+ */
+static int
+tapdisk_channel_start_process(tapdisk_channel_t *channel,
+                             char *write_dev, char *read_dev)
+{
+       pid_t child;
+       char *argv[] = { "tapdisk", write_dev, read_dev, NULL };
+
+       if ((child = fork()) == -1)
+               return -errno;
+
+       if (!child) {
+               int i;
+               long max_fd = sysconf(_SC_OPEN_MAX);
+
+               for (i = 0 ; i < max_fd ; i++)
+                       if (i != STDIN_FILENO &&
+                           i != STDOUT_FILENO &&
+                           i != STDERR_FILENO)
+                               close(i);
+
+               execvp("tapdisk", argv);
+               _exit(1);
+       }
+
+       /*
+        * Retry only when interrupted by a signal.  Any other failure
+        * (e.g. ECHILD) means there is nothing left to wait for, and
+        * retrying would spin forever.
+        */
+       while (waitpid(child, NULL, 0) == -1) {
+               if (errno != EINTR)
+                       break;
+       }
+
+       return 0;
+}
+
+/*
+ * Start a new tapdisk process for this channel: obtain a device number,
+ * create and open the two control FIFOs (tapctrlwrite<minor> and
+ * tapctrlread<minor> under BLKTAP_CTRL_DIR), spawn tapdisk, and send it
+ * a pid request.
+ *
+ * Returns the result of the pid request on success, or a negative error
+ * code; on failure after the FIFOs were opened they are closed again.
+ */
+static int
+tapdisk_channel_launch_tapdisk(tapdisk_channel_t *channel)
+{
+       int err;
+       char *rd_path = NULL, *wr_path = NULL;
+
+       channel->read_fd  = -1;
+       channel->write_fd = -1;
+
+       err = tapdisk_channel_get_device_number(channel);
+       if (err)
+               return err;
+
+       if (asprintf(&wr_path, "%s/tapctrlwrite%d",
+                    BLKTAP_CTRL_DIR, channel->minor) == -1) {
+               wr_path = NULL;
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       if (asprintf(&rd_path, "%s/tapctrlread%d",
+                    BLKTAP_CTRL_DIR, channel->minor) == -1) {
+               rd_path = NULL;
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       channel->write_fd = tapdisk_channel_open_control_socket(wr_path);
+       if (channel->write_fd < 0) {
+               /* the helper returns a negative errno instead of an fd */
+               err = channel->write_fd;
+               channel->write_fd = -1;
+               goto fail;
+       }
+
+       channel->read_fd = tapdisk_channel_open_control_socket(rd_path);
+       if (channel->read_fd < 0) {
+               err = channel->read_fd;
+               channel->read_fd = -1;
+               goto fail;
+       }
+
+       err = tapdisk_channel_start_process(channel, wr_path, rd_path);
+       if (err)
+               goto fail;
+
+       channel->channel_id = channel->write_fd;
+       channel->open       = 1;
+
+       free(rd_path);
+       free(wr_path);
+
+       DPRINTF("process launched, channel = %d:%d\n",
+               channel->channel_id, channel->cookie);
+
+       return tapdisk_channel_send_pid_request(channel);
+
+fail:
+       free(rd_path);
+       free(wr_path);
+       if (channel->read_fd != -1)
+               close(channel->read_fd);
+       if (channel->write_fd != -1)
+               close(channel->write_fd);
+       return err;
+}
+
+/*
+ * Connect the channel to a tapdisk process.
+ *
+ * tapdisk_daemon_find_channel() is consulted first.  If the channel has
+ * no tapdisk pid afterwards, a new tapdisk is launched.  Otherwise only
+ * a device number is obtained and a pid request is sent to the existing
+ * process.
+ */
+static int
+tapdisk_channel_connect(tapdisk_channel_t *channel)
+{
+       int rc;
+
+       tapdisk_daemon_find_channel(channel);
+
+       if (channel->tapdisk_pid == 0)
+               return tapdisk_channel_launch_tapdisk(channel);
+
+       DPRINTF("%s: process exists: %d, channel = %d:%d\n",
+               channel->path, channel->tapdisk_pid,
+               channel->channel_id, channel->cookie);
+
+       rc = tapdisk_channel_get_device_number(channel);
+       if (rc != 0)
+               return rc;
+
+       return tapdisk_channel_send_pid_request(channel);
+}
+
+/*
+ * Build the xenstore path strings the channel uses (tapdisk-uuid, pause,
+ * pause-done and shutdown-tapdisk under channel->path) and set the fixed
+ * share-tapdisks path.
+ *
+ * Returns 0 on success.  On allocation failure every string built so far
+ * is freed, all five pointers are reset to NULL and -ENOMEM is returned.
+ */
+static int
+tapdisk_channel_init(tapdisk_channel_t *channel)
+{
+       channel->uuid_str          = NULL;
+       channel->pause_str         = NULL;
+       channel->pause_done_str    = NULL;
+       channel->shutdown_str      = NULL;
+       channel->share_tapdisk_str = NULL;
+
+       if (asprintf(&channel->uuid_str,
+                    "%s/tapdisk-uuid", channel->path) == -1) {
+               channel->uuid_str = NULL;
+               goto fail;
+       }
+
+       if (asprintf(&channel->pause_str,
+                    "%s/pause", channel->path) == -1) {
+               channel->pause_str = NULL;
+               goto fail;
+       }
+
+       if (asprintf(&channel->pause_done_str,
+                    "%s/pause-done", channel->path) == -1) {
+               channel->pause_done_str = NULL;
+               goto fail;
+       }
+
+       if (asprintf(&channel->shutdown_str,
+                    "%s/shutdown-tapdisk", channel->path) == -1) {
+               channel->shutdown_str = NULL;
+               goto fail;
+       }
+
+       /* a string literal, not heap memory: it must never be freed */
+       channel->share_tapdisk_str = "/local/domain/0/tapdisk/share-tapdisks";
+
+       return 0;
+
+fail:
+       free(channel->uuid_str);
+       free(channel->pause_str);
+       free(channel->pause_done_str);
+       free(channel->shutdown_str);
+       channel->uuid_str          = NULL;
+       channel->pause_str         = NULL;
+       channel->pause_done_str    = NULL;
+       channel->shutdown_str      = NULL;
+       channel->share_tapdisk_str = NULL;
+       return -ENOMEM;
+}
+
+/*
+ * Register the channel's two xenstore watches: one on the pause node and
+ * one on the shutdown node.  Both carry the channel as callback data.
+ *
+ * Returns 0 when both are registered.  If a registration fails, that
+ * watch's node is cleared, any watch whose node is still set is
+ * unregistered, and the registration error is returned.
+ */
+static int
+tapdisk_channel_set_watches(tapdisk_channel_t *channel)
+{
+       int rc;
+       struct xenbus_watch *pause    = &channel->pause_watch;
+       struct xenbus_watch *shutdown = &channel->shutdown_watch;
+
+       /* watch for pause events */
+       pause->node     = channel->pause_str;
+       pause->callback = tapdisk_channel_pause_event;
+       pause->data     = channel;
+       rc = register_xenbus_watch(channel->xsh, pause);
+       if (rc) {
+               pause->node = NULL;
+               goto fail;
+       }
+
+       /* watch for shutdown events */
+       shutdown->node     = channel->shutdown_str;
+       shutdown->callback = tapdisk_channel_shutdown_event;
+       shutdown->data     = channel;
+       rc = register_xenbus_watch(channel->xsh, shutdown);
+       if (rc) {
+               shutdown->node = NULL;
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       /* a non-NULL node marks a watch that still needs unregistering */
+       if (pause->node) {
+               unregister_xenbus_watch(channel->xsh, pause);
+               pause->node = NULL;
+       }
+       if (shutdown->node) {
+               unregister_xenbus_watch(channel->xsh, shutdown);
+               shutdown->node = NULL;
+       }
+       return rc;
+}
+
+/*
+ * Set channel->storage from the "sm-data/storage-type" xenstore node.
+ *
+ * "nfs", "ext" and "lvm" map to the matching TAPDISK_STORAGE_TYPE_*
+ * value.  If the node is missing, unreadable or holds anything else, the
+ * storage type stays TAPDISK_STORAGE_TYPE_DEFAULT.
+ */
+static void
+tapdisk_channel_get_storage_type(tapdisk_channel_t *channel)
+{
+       unsigned int len;
+       char *node, *value;
+
+       channel->storage = TAPDISK_STORAGE_TYPE_DEFAULT;
+
+       if (asprintf(&node, "%s/sm-data/storage-type", channel->path) == -1)
+               return;
+
+       value = xs_read(channel->xsh, XBT_NULL, node, &len);
+       if (value != NULL) {
+               if (strcmp(value, "nfs") == 0)
+                       channel->storage = TAPDISK_STORAGE_TYPE_NFS;
+               else if (strcmp(value, "ext") == 0)
+                       channel->storage = TAPDISK_STORAGE_TYPE_EXT;
+               else if (strcmp(value, "lvm") == 0)
+                       channel->storage = TAPDISK_STORAGE_TYPE_LVM;
+       }
+
+       free(node);
+       free(value);
+}
+
+/*
+ * Parse the bus id from channel->path: everything that follows the
+ * seventh '/' is converted with atoi() and stored in channel->busid.
+ *
+ * Returns 0 on success, or -EINVAL when the path has no seventh
+ * separator or when the trailing component does not fit the local
+ * conversion buffer.
+ */
+static int
+tapdisk_channel_get_busid(tapdisk_channel_t *channel)
+{
+       int len, end, count;
+       const char *ptr;
+       char num[10];
+
+       len = strsep_len(channel->path, '/', 6);
+       end = strlen(channel->path);
+
+       /*
+        * strsep_len() returns strlen(path) when the path holds exactly
+        * six separators; in that case there is nothing after the
+        * separator to parse, and path + len + 1 would point past the
+        * terminator.
+        */
+       if (len < 0 || len >= end) {
+               EPRINTF("invalid path: %s\n", channel->path);
+               return -EINVAL;
+       }
+
+       ptr   = channel->path + len + 1;
+       count = end - (len + 1);
+
+       /* leave room for the terminator in num[] */
+       if (count < 0 || (size_t)count >= sizeof(num)) {
+               EPRINTF("invalid path: %s\n", channel->path);
+               return -EINVAL;
+       }
+
+       memcpy(num, ptr, count);
+       num[count] = '\0';
+
+       channel->busid = atoi(num);
+       return 0;
+}
+
+/*
+ * Split channel->params ("<handle>:<vdi path>") and look the handle up
+ * in dtypes[] to select channel->drivertype.
+ *
+ * channel->vdi_path is pointed at the text after the first ':' inside
+ * channel->params (no copy is made).  When the xenstore node
+ * "<path>/sm-data/vdi-type" exists, its value replaces the handle taken
+ * from params.
+ *
+ * Returns 0 on success; on any failure logs the params, resets
+ * channel->vdi_path to NULL and returns -EINVAL.
+ */
+static int
+tapdisk_channel_parse_params(tapdisk_channel_t *channel)
+{
+       int i, size, err;
+       unsigned int len;
+       size_t hlen;
+       char *ptr, *path, handle[10];
+       char *vdi_type;
+       char *vtype;
+
+       path = channel->params;
+       size = sizeof(dtypes) / sizeof(disk_info_t *);
+
+       if (strlen(path) + 1 >= TAPDISK_MESSAGE_MAX_PATH_LENGTH)
+               goto fail;
+
+       ptr = strchr(path, ':');
+       if (!ptr)
+               goto fail;
+
+       /* the handle must fit handle[] together with its terminator */
+       hlen = ptr - path;
+       if (hlen >= sizeof(handle))
+               goto fail;
+
+       channel->vdi_path = ptr + 1;
+       memcpy(handle, path, hlen);
+       handle[hlen] = '\0';
+
+       err = asprintf(&vdi_type, "%s/sm-data/vdi-type", channel->path);
+       if (err == -1)
+               goto fail;
+
+       if (xs_exists(channel->xsh, vdi_type)) {
+               vtype = xs_read(channel->xsh, XBT_NULL, vdi_type, &len);
+               free(vdi_type);
+               if (!vtype)
+                       goto fail;
+               if (len >= sizeof(handle) - 1) {
+                       free(vtype);
+                       goto fail;
+               }
+               snprintf(handle, sizeof(handle), "%s", vtype);
+               free(vtype);
+               hlen = strlen(handle);
+       } else
+               free(vdi_type);
+
+       /*
+        * Prefix comparison over the length of the handle in use.
+        * NOTE(review): an empty handle compares equal to every entry
+        * and therefore selects dtypes[0] -- confirm this is intended.
+        */
+       for (i = 0; i < size; i++) {
+               if (strncmp(handle, dtypes[i]->handle, hlen))
+                       continue;
+
+               if (dtypes[i]->idnum == -1)
+                       goto fail;
+
+               channel->drivertype = dtypes[i]->idnum;
+               return 0;
+       }
+
+fail:
+       EPRINTF("%s: invalid blktap params: %s\n",
+               channel->path, channel->params);
+       channel->vdi_path = NULL;
+       return -EINVAL;
+}
+
+/*
+ * Read the device description ("frontend", "frontend-id", "params" and
+ * "mode") from xenstore under channel->path, then derive the driver
+ * type, bus id and storage type from it.
+ *
+ * Returns 0 on success or the first error reported by xs_gather(),
+ * tapdisk_channel_parse_params() or tapdisk_channel_get_busid().
+ */
+static int
+tapdisk_channel_gather_info(tapdisk_channel_t *channel)
+{
+       int err;
+
+       /*
+        * channel->domid is an unsigned int, so it is scanned with "%u";
+        * a long-sized conversion would store past the field on LP64.
+        * NOTE(review): assumes xs_gather hands the format string to a
+        * scanf-style parser -- confirm against the xs_api sources.
+        */
+       err = xs_gather(channel->xsh, channel->path,
+                       "frontend", NULL, &channel->frontpath,
+                       "frontend-id", "%u", &channel->domid,
+                       "params", NULL, &channel->params,
+                       "mode", "%c", &channel->mode, NULL);
+       if (err) {
+               EPRINTF("could not find device info: %d\n", err);
+               return err;
+       }
+
+       err = tapdisk_channel_parse_params(channel);
+       if (err)
+               return err;
+
+       err = tapdisk_channel_get_busid(channel);
+       if (err)
+               return err;
+
+       /* cannot fail: falls back to the default storage type */
+       tapdisk_channel_get_storage_type(channel);
+
+       return 0;
+}
+
+/*
+ * Check that xenstore describes a fresh start request for this channel:
+ * "<path>/start-tapdisk" must exist, while "<path>/shutdown-request",
+ * channel->shutdown_str and "<path>/shutdown-done" must not.
+ *
+ * Returns 0 when the request is valid, -EINVAL when one of the checks
+ * fails and -ENOMEM when a xenstore path cannot be allocated.
+ */
+static int
+tapdisk_channel_verify_start_request(tapdisk_channel_t *channel)
+{
+       char *path;
+       int err;        /* asprintf() returns a signed int, -1 on failure */
+
+       err = asprintf(&path, "%s/start-tapdisk", channel->path);
+       if (err == -1)
+               goto mem_fail;
+
+       if (!xs_exists(channel->xsh, path))
+               goto fail;
+
+       free(path);
+       err = asprintf(&path, "%s/shutdown-request", channel->path);
+       if (err == -1)
+               goto mem_fail;
+
+       if (xs_exists(channel->xsh, path))
+               goto fail;
+
+       if (xs_exists(channel->xsh, channel->shutdown_str))
+               goto fail;
+
+       free(path);
+       err = asprintf(&path, "%s/shutdown-done", channel->path);
+       if (err == -1)
+               goto mem_fail;
+
+       if (xs_exists(channel->xsh, path))
+               goto fail;
+
+       free(path);
+
+       return 0;
+
+fail:
+       /* path still holds the node whose check failed */
+       free(path);
+       EPRINTF("%s:%s: invalid start request\n", __func__, channel->path);
+       return -EINVAL;
+
+mem_fail:
+       /* path is not valid after a failed asprintf(): nothing to free */
+       EPRINTF("%s:%s: out of memory\n", __func__, channel->path);
+       return -ENOMEM;
+}
+
+/*
+ * Tear a channel down: send a shutdown request if the channel is open,
+ * drop its xenstore watches, detach it from the daemon and release the
+ * channel together with the strings it owns.
+ */
+void
+tapdisk_channel_close(tapdisk_channel_t *channel)
+{
+       int i;
+       struct xenbus_watch *watches[2];
+
+       if (channel->channel_id) {
+               DPRINTF("%s: closing channel %d:%d\n",
+                       channel->path, channel->channel_id, channel->cookie);
+       }
+
+       if (channel->open) {
+               tapdisk_channel_send_shutdown_request(channel);
+       }
+
+       /* a non-NULL node marks a watch that is still registered */
+       watches[0] = &channel->pause_watch;
+       watches[1] = &channel->shutdown_watch;
+       for (i = 0; i < 2; i++) {
+               if (!watches[i]->node)
+                       continue;
+
+               unregister_xenbus_watch(channel->xsh, watches[i]);
+               watches[i]->node = NULL;
+       }
+
+       tapdisk_daemon_close_channel(channel);
+
+       free(channel->params);
+       free(channel->frontpath);
+       free(channel->shutdown_str);
+       free(channel->pause_done_str);
+       free(channel->pause_str);
+       free(channel->uuid_str);
+       free(channel->path);
+       free(channel);
+}
+
+/*
+ * Allocate a channel for the backend node 'path' and run it through the
+ * start-up steps: init, uuid check, parameter gathering, start-request
+ * verification, watch registration and connection to tapdisk.
+ *
+ * On success the new channel is stored in *_channel and 0 is returned.
+ * On failure *_channel is left NULL, the error is reported through
+ * tapdisk_channel_fatal() and returned (-ENOMEM if the channel itself
+ * cannot be allocated).
+ */
+int
+tapdisk_channel_open(tapdisk_channel_t **_channel,
+                    char *path, struct xs_handle *xsh,
+                    int blktap_fd, uint16_t cookie)
+{
+       int err;
+       char *msg = NULL;
+       tapdisk_channel_t *channel;
+
+       *_channel = NULL;
+
+       channel = calloc(1, sizeof(tapdisk_channel_t));
+       if (!channel)
+               return -ENOMEM;
+
+       channel->xsh       = xsh;
+       channel->blktap_fd = blktap_fd;
+       channel->cookie    = cookie;
+       channel->state     = TAPDISK_CHANNEL_IDLE;
+
+       INIT_LIST_HEAD(&channel->list);
+
+       /* each step runs only if every previous one succeeded */
+       channel->path = strdup(path);
+       if (!channel->path)
+               err = -ENOMEM;
+       else if ((err = tapdisk_channel_init(channel)) != 0)
+               msg = "allocating device";
+       else if ((err = tapdisk_channel_check_uuid(channel)) != 0)
+               msg = "checking uuid";
+       else if ((err = tapdisk_channel_gather_info(channel)) != 0)
+               msg = "gathering parameters";
+       else if ((err = tapdisk_channel_verify_start_request(channel)) != 0)
+               msg = "invalid start request";
+       else if ((err = tapdisk_channel_set_watches(channel)) != 0)
+               msg = "registering xenstore watches";
+       else if ((err = tapdisk_channel_connect(channel)) != 0)
+               msg = "connecting to tapdisk";
+
+       if (!err) {
+               *_channel = channel;
+               return 0;
+       }
+
+       tapdisk_channel_fatal(channel, "%s: %d", (msg ? msg : "failure"), err);
+       return err;
+}
+
+/*
+ * Dispatch a message received from tapdisk to the handler for its type.
+ * A message that fails validation, or whose type has no handler here,
+ * is reported through tapdisk_channel_fatal() and yields -EINVAL.
+ */
+int
+tapdisk_channel_receive_message(tapdisk_channel_t *c, tapdisk_message_t *m)
+{
+       if (tapdisk_channel_validate_message(c, m) == 0) {
+               switch (m->type) {
+               case TAPDISK_MESSAGE_PID_RSP:
+                       return tapdisk_channel_receive_pid_response(c, m);
+
+               case TAPDISK_MESSAGE_OPEN_RSP:
+                       return tapdisk_channel_receive_open_response(c, m);
+
+               case TAPDISK_MESSAGE_PAUSE_RSP:
+                       return tapdisk_channel_receive_pause_response(c, m);
+
+               case TAPDISK_MESSAGE_RESUME_RSP:
+                       return tapdisk_channel_receive_resume_response(c, m);
+
+               case TAPDISK_MESSAGE_CLOSE_RSP:
+                       return tapdisk_channel_receive_shutdown_response(c, m);
+
+               case TAPDISK_MESSAGE_RUNTIME_ERROR:
+                       return tapdisk_channel_receive_runtime_error(c, m);
+
+               default:
+                       /* no handler: reported below */
+                       break;
+               }
+       }
+
+       tapdisk_channel_fatal(c, "received unexpected message %s in state %d",
+                             tapdisk_message_name(m->type), c->state);
+       return -EINVAL;
+}
diff -r f210a633571c -r 1c627434605e tools/blktap2/daemon/tapdisk-daemon.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/daemon/tapdisk-daemon.c     Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,599 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include <xs.h>
+#include "disktypes.h"
+#include "tapdisk-dispatch.h"
+
+/* token of the temporary xenstore watch set while waiting for the domid */
+#define TAPDISK_DAEMON_DOMID_WATCH   "domid-watch"
+/* pid file; it is also locked with lockf() to detect a running daemon */
+#define TAPDISK_DAEMON_PIDFILE       "/var/run/blktapctrl.pid"
+
+/* State of the single daemon instance (see tapdisk_daemon below). */
+typedef struct tapdisk_daemon {
+       /* xenstore path "/local/domain/<domid>/backend/tap"; heap-allocated */
+       char                         *node;
+       /* descriptor of the blktap0 device opened in tapdisk_daemon_init() */
+       int                           blktap_fd;
+       /* counter; its value is handed to each newly probed channel */
+       uint16_t                      cookie;
+
+       /* xenstore connection */
+       struct xs_handle             *xsh;
+       /* list of tapdisk_channel_t, linked through their 'list' member */
+       struct list_head              channels;
+       /* watch on 'node', serviced by tapdisk_daemon_probe() */
+       struct xenbus_watch           watch;
+} tapdisk_daemon_t;
+
+static tapdisk_daemon_t tapdisk_daemon;
+
+/* iterate over all channels; 'tmp' holds the lookahead cursor */
+#define tapdisk_daemon_for_each_channel(c, tmp) \
+       list_for_each_entry_safe(c, tmp, &tapdisk_daemon.channels, list)
+
+/* NOTE: evaluates the selected argument twice */
+#define MAX(a, b) ((a) >= (b) ? (a) : (b))
+
+/* Log the daemon version followed by the name of every entry in dtypes[]. */
+static void
+tapdisk_daemon_print_drivers(void)
+{
+       int idx;
+       const int count = sizeof(dtypes) / sizeof(disk_info_t *);
+
+       DPRINTF("blktap-daemon: v1.0.2\n");
+
+       idx = 0;
+       while (idx < count) {
+               DPRINTF("Found driver: [%s]\n", dtypes[idx]->name);
+               idx++;
+       }
+}
+
+/*
+ * Create (if needed), lock and fill TAPDISK_DAEMON_PIDFILE with 'pid'.
+ *
+ * If the file is already locked the process exits with status 0, which
+ * is how a second daemon instance silently goes away.  On success the
+ * descriptor is deliberately left open: the lockf() lock is held
+ * through it.  On failure the descriptor is closed and a negative errno
+ * value is returned.
+ */
+static int
+tapdisk_daemon_write_pidfile(long pid)
+{
+       char buf[100];
+       int len, fd, flags, err, written;
+
+       fd = open(TAPDISK_DAEMON_PIDFILE, O_RDWR | O_CREAT, 0600);
+       if (fd == -1) {
+               /* capture errno before logging can overwrite it */
+               err = -errno;
+               EPRINTF("Opening pid file failed (%d)\n", -err);
+               return err;
+       }
+
+       /* We exit silently if daemon already running */
+       if (lockf(fd, F_TLOCK, 0) == -1)
+               exit(0);
+
+       /* Set FD_CLOEXEC, so that tapdisk doesn't get this file descriptor */
+       flags = fcntl(fd, F_GETFD);
+       if (flags == -1) {
+               err = -errno;
+               EPRINTF("F_GETFD failed (%d)\n", -err);
+               goto fail;
+       }
+
+       if (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) == -1) {
+               err = -errno;
+               EPRINTF("F_SETFD failed (%d)\n", -err);
+               goto fail;
+       }
+
+       /* drop the previous contents so no stale digits survive */
+       if (ftruncate(fd, 0) == -1) {
+               err = -errno;
+               EPRINTF("Truncating pid file failed (%d)\n", -err);
+               goto fail;
+       }
+
+       len     = snprintf(buf, sizeof(buf), "%ld\n", pid);
+       written = write(fd, buf, len);
+       if (written != len) {
+               /* a short write does not set errno */
+               err = (written == -1) ? -errno : -EIO;
+               EPRINTF("Writing pid file failed (%d)\n", -err);
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       close(fd);
+       return err;
+}
+
+/*
+ * Reset the daemon state, create and open the blktap0 device node and
+ * connect to xenstore (two attempts, sleeping two seconds after each
+ * failed one).
+ *
+ * Returns 0 on success.  On failure the device descriptor is closed,
+ * the daemon state is zeroed again and a negative error is returned.
+ */
+static int
+tapdisk_daemon_init(void)
+{
+       char *devname;
+       int i, err, blktap_major;
+
+       memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon_t));
+       /* mark the descriptor as not opened yet */
+       tapdisk_daemon.blktap_fd = -1;
+
+       err = asprintf(&devname, "%s/%s0", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME);
+       if (err == -1) {
+               devname = NULL;
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       err = xc_find_device_number("blktap0");
+       if (err < 0)
+               goto fail;
+
+       blktap_major = major(err);
+       err = make_blktap_device(devname, blktap_major, 0, S_IFCHR | 0600);
+       if (err)
+               goto fail;
+
+       tapdisk_daemon.blktap_fd = open(devname, O_RDWR);
+       if (tapdisk_daemon.blktap_fd == -1) {
+               err = -errno;
+               EPRINTF("blktap0 open failed\n");
+               goto fail;
+       }
+
+       for (i = 0; i < 2; i++) {
+               tapdisk_daemon.xsh = xs_daemon_open();
+               if (!tapdisk_daemon.xsh) {
+                       EPRINTF("xs_daemon_open failed -- "
+                               "is xenstore running?\n");
+                       sleep(2);
+               } else
+                       break;
+       }
+
+       if (!tapdisk_daemon.xsh) {
+               err = -ENOSYS;
+               goto fail;
+       }
+
+       INIT_LIST_HEAD(&tapdisk_daemon.channels);
+
+       free(devname);
+       return 0;
+
+fail:
+       if (tapdisk_daemon.blktap_fd >= 0)
+               close(tapdisk_daemon.blktap_fd);
+       free(devname);
+       memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon_t));
+       EPRINTF("%s: %d\n", __func__, err);
+
+       return err;
+}
+
+/*
+ * Build tapdisk_daemon.node ("/local/domain/<domid>/backend/tap") from
+ * the domain id reported by get_dom_domid().
+ *
+ * Returns 0 on success, -EAGAIN when no domain id is available yet and
+ * -ENOMEM when the path cannot be allocated.
+ */
+static int
+tapdisk_daemon_set_node(void)
+{
+       int rc = 0;
+       char *id;
+
+       id = get_dom_domid(tapdisk_daemon.xsh);
+       if (!id)
+               return -EAGAIN;
+
+       if (asprintf(&tapdisk_daemon.node,
+                    "/local/domain/%s/backend/tap", id) == -1) {
+               tapdisk_daemon.node = NULL;
+               rc = -ENOMEM;
+       }
+
+       free(id);
+       return rc;
+}
+
+/*
+ * Consume one pending xenstore watch event.  If it carries the
+ * TAPDISK_DAEMON_DOMID_WATCH token, retry tapdisk_daemon_set_node().
+ *
+ * Returns -EAGAIN when no event could be read, -EINVAL for an event
+ * with a different token, otherwise the result of
+ * tapdisk_daemon_set_node().
+ */
+static int
+tapdisk_daemon_get_domid(void)
+{
+       int rc;
+       unsigned int count;
+       char **event;
+
+       event = xs_read_watch(tapdisk_daemon.xsh, &count);
+       if (!event)
+               return -EAGAIN;
+
+       if (strcmp(event[XS_WATCH_TOKEN], TAPDISK_DAEMON_DOMID_WATCH) != 0)
+               rc = -EINVAL;
+       else
+               rc = tapdisk_daemon_set_node();
+
+       free(event);
+       return rc;
+}
+
+/*
+ * Make sure tapdisk_daemon.node is set.  If the domain id is not
+ * available right away, watch "/local/domain" and keep handling events
+ * for as long as tapdisk_daemon_get_domid() answers -EAGAIN.
+ *
+ * Returns 0 on success, -EINVAL when the watch cannot be set, otherwise
+ * the first result other than -EAGAIN from tapdisk_daemon_get_domid().
+ */
+static int
+tapdisk_daemon_wait_for_domid(void)
+{
+       int rc, xs_fd;
+       fd_set rfds;
+
+       if (tapdisk_daemon_set_node() == 0)
+               return 0;
+
+       if (!xs_watch(tapdisk_daemon.xsh, "/local/domain",
+                     TAPDISK_DAEMON_DOMID_WATCH)) {
+               EPRINTF("unable to set domain id watch\n");
+               return -EINVAL;
+       }
+
+       for (;;) {
+               xs_fd = xs_fileno(tapdisk_daemon.xsh);
+
+               FD_ZERO(&rfds);
+               FD_SET(xs_fd, &rfds);
+
+               /* block until xenstore has something for us */
+               select(xs_fd + 1, &rfds, NULL, NULL, NULL);
+
+               rc = -EAGAIN;
+               if (FD_ISSET(xs_fd, &rfds))
+                       rc = tapdisk_daemon_get_domid();
+
+               if (rc != -EAGAIN)
+                       break;
+       }
+
+       xs_unwatch(tapdisk_daemon.xsh,
+                  "/local/domain", TAPDISK_DAEMON_DOMID_WATCH);
+       return rc;
+}
+
+/* Return 1 when 'node' is exactly "start-tapdisk", 0 otherwise. */
+static inline int
+tapdisk_daemon_new_vbd_event(const char *node)
+{
+       return strcmp(node, "start-tapdisk") == 0;
+}
+
+/*
+ * Write 'uuid' in decimal to the xenstore node "<path>/tapdisk-uuid".
+ * Returns 0 on success, -ENOMEM when the node name cannot be allocated
+ * and -errno when xs_write() reports failure.
+ */
+static int
+tapdisk_daemon_write_uuid(char *path, uint32_t uuid)
+{
+       int ok;
+       char *key, value[12];
+
+       snprintf(value, sizeof(value), "%u", uuid);
+
+       if (asprintf(&key, "%s/tapdisk-uuid", path) == -1)
+               return -ENOMEM;
+
+       ok = xs_write(tapdisk_daemon.xsh, XBT_NULL,
+                     key, value, strlen(value));
+       free(key);
+
+       if (ok)
+               return 0;
+
+       return -errno;
+}
+
+/*
+ * Watch callback for the backend node.  When the event path ends in a
+ * "start-tapdisk" component below the eighth '/', and that node exists,
+ * a new cookie is written as the device's tapdisk-uuid and a channel is
+ * opened for the device directory (the path up to that separator).
+ * A channel that opens successfully is added to the daemon's list.
+ */
+static void
+tapdisk_daemon_probe(struct xs_handle *xsh,
+                    struct xenbus_watch *watch, const char *path)
+{
+       char *cpath;
+       int len, err;
+       uint32_t cookie;
+       const char *node;
+       tapdisk_channel_t *channel;
+
+       len = strsep_len(path, '/', 7);
+       if (len < 0)
+               return;
+
+       /*
+        * strsep_len() returns strlen(path) when the path holds exactly
+        * seven separators; there is then no node component, and
+        * path + len + 1 would point past the terminator.
+        */
+       if (path[len] == '\0')
+               return;
+
+       node = path + len + 1;
+
+       if (!tapdisk_daemon_new_vbd_event(node))
+               return;
+
+       if (!xs_exists(xsh, path))
+               return;
+
+       cpath = strdup(path);
+       if (!cpath) {
+               EPRINTF("failed to allocate control path for %s\n", path);
+               return;
+       }
+       /* cut the copy down to the device directory */
+       cpath[len] = '\0';
+
+       cookie = tapdisk_daemon.cookie++;
+       err    = tapdisk_daemon_write_uuid(cpath, cookie);
+       if (err)
+               goto out;
+
+       DPRINTF("%s: got watch on %s, uuid = %u\n", __func__, path, cookie);
+
+       err = tapdisk_channel_open(&channel, cpath,
+                                  tapdisk_daemon.xsh,
+                                  tapdisk_daemon.blktap_fd,
+                                  cookie);
+       if (!err)
+               list_add(&channel->list, &tapdisk_daemon.channels);
+       else
+               EPRINTF("failed to open tapdisk channel for %s: %d\n",
+                       path, err);
+
+out:
+       free(cpath);
+}
+
+/*
+ * Resolve the backend node, register the probe watch on it and switch
+ * the blktap device to interpose mode, announcing our pid to it.
+ *
+ * Returns 0 on success.  If the watch cannot be registered the node is
+ * freed and the registration error is returned.
+ */
+static int
+tapdisk_daemon_start(void)
+{
+       int rc;
+
+       rc = tapdisk_daemon_wait_for_domid();
+       if (rc)
+               return rc;
+
+       tapdisk_daemon.watch.node     = tapdisk_daemon.node;
+       tapdisk_daemon.watch.callback = tapdisk_daemon_probe;
+
+       rc = register_xenbus_watch(tapdisk_daemon.xsh, &tapdisk_daemon.watch);
+       if (!rc) {
+               ioctl(tapdisk_daemon.blktap_fd,
+                     BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE);
+               ioctl(tapdisk_daemon.blktap_fd,
+                     BLKTAP_IOCTL_SENDPID, getpid());
+               return 0;
+       }
+
+       free(tapdisk_daemon.node);
+       tapdisk_daemon.node       = NULL;
+       tapdisk_daemon.watch.node = NULL;
+       EPRINTF("%s: %d\n", __func__, rc);
+       return rc;
+}
+
+/*
+ * Remove the probe watch, switch the blktap device back to passthrough
+ * mode and close it.  Always returns 0.
+ */
+static int
+tapdisk_daemon_stop(void)
+{
+       int fd;
+
+       unregister_xenbus_watch(tapdisk_daemon.xsh, &tapdisk_daemon.watch);
+
+       fd = tapdisk_daemon.blktap_fd;
+       ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_PASSTHROUGH);
+       close(fd);
+
+       return 0;
+}
+
+/* Release the node string and the xenstore handle, then zero the state. */
+static void
+tapdisk_daemon_free(void)
+{
+       free(tapdisk_daemon.node);
+       xs_daemon_close(tapdisk_daemon.xsh);
+
+       memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon));
+}
+
+/*
+ * Read exactly one tapdisk_message_t from 'fd' into 'message'.
+ *
+ * fd:       descriptor to read from
+ * message:  destination; zeroed before reading
+ * timeout:  seconds handed to select() while waiting for data
+ *
+ * Returns 0 once the whole message has arrived, or -EIO if select()
+ * fails or times out, or read() fails or reports end of file, before
+ * the message is complete.
+ */
+static int
+tapdisk_daemon_read_message(int fd, tapdisk_message_t *message, int timeout)
+{
+       fd_set readfds;
+       struct timeval tv;
+       int ret, len, offset;
+       char *buf;
+
+       tv.tv_sec  = timeout;
+       tv.tv_usec = 0;
+       offset     = 0;
+       len        = sizeof(tapdisk_message_t);
+
+       /* offsets are byte counts, so address the message as bytes */
+       buf        = (char *)message;
+
+       memset(message, 0, sizeof(tapdisk_message_t));
+
+       while (offset < len) {
+               FD_ZERO(&readfds);
+               FD_SET(fd, &readfds);
+
+               /* we don't bother reinitializing tv. at worst, it will wait a
+                * bit more time than expected. */
+
+               ret = select(fd + 1, &readfds, NULL, NULL, &tv);
+               if (ret == -1)
+                       break;
+               else if (FD_ISSET(fd, &readfds)) {
+                       ret = read(fd, buf + offset, len - offset);
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break;
+       }
+
+       return (offset == len ? 0 : -EIO);
+}
+
+/*
+ * Read one message from 'fd' (two second timeout) and pass it to the
+ * channel whose cookie and read descriptor both match.
+ *
+ * Returns the read error, the channel handler's result, or -EINVAL when
+ * no channel matches.
+ */
+static int
+tapdisk_daemon_receive_message(int fd)
+{
+       int rc;
+       tapdisk_message_t msg;
+       tapdisk_channel_t *chan, *next;
+
+       rc = tapdisk_daemon_read_message(fd, &msg, 2);
+       if (rc) {
+               EPRINTF("failed reading message on %d: %d\n", fd, rc);
+               return rc;
+       }
+
+       tapdisk_daemon_for_each_channel(chan, next) {
+               if (chan->cookie != msg.cookie || chan->read_fd != fd)
+                       continue;
+
+               DPRINTF("got '%s' message from %d:%d\n",
+                       tapdisk_message_name(msg.type),
+                       chan->channel_id, chan->cookie);
+
+               return tapdisk_channel_receive_message(chan, &msg);
+       }
+
+       EPRINTF("unrecognized message on %d: '%s' (uuid = %u)\n",
+               fd, tapdisk_message_name(msg.type), msg.cookie);
+
+       return -EINVAL;
+}
+
+/*
+ * Fill 'readfds' with the xenstore descriptor and the read descriptor
+ * of every channel.  Returns the highest descriptor placed in the set.
+ */
+static int
+tapdisk_daemon_set_fds(fd_set *readfds)
+{
+       int highest;
+       tapdisk_channel_t *chan, *next;
+
+       highest = xs_fileno(tapdisk_daemon.xsh);
+
+       FD_ZERO(readfds);
+       FD_SET(highest, readfds);
+
+       tapdisk_daemon_for_each_channel(chan, next) {
+               int chan_fd = chan->read_fd;
+
+               if (chan_fd > highest)
+                       highest = chan_fd;
+               FD_SET(chan_fd, readfds);
+       }
+
+       return highest;
+}
+
+/*
+ * Act on the descriptors select() marked readable: fire the next
+ * xenstore watch if the xenstore descriptor is set, then receive a
+ * message from the first channel whose read descriptor is set.
+ *
+ * Returns that receive result, or 0 if no channel was readable.
+ */
+static int
+tapdisk_daemon_check_fds(fd_set *readfds)
+{
+       tapdisk_channel_t *chan, *next;
+
+       if (FD_ISSET(xs_fileno(tapdisk_daemon.xsh), readfds)) {
+               xs_fire_next_watch(tapdisk_daemon.xsh);
+       }
+
+       tapdisk_daemon_for_each_channel(chan, next) {
+               if (!FD_ISSET(chan->read_fd, readfds))
+                       continue;
+
+               /* only one message is handled per call */
+               return tapdisk_daemon_receive_message(chan->read_fd);
+       }
+
+       return 0;
+}
+
+/*
+ * Main event loop: wait on the xenstore and channel descriptors and
+ * dispatch whatever becomes readable.  The loop has no exit condition,
+ * so the trailing return is never reached.
+ */
+static int
+tapdisk_daemon_run(void)
+{
+       int rc, highest;
+       fd_set rfds;
+
+       for (;;) {
+               highest = tapdisk_daemon_set_fds(&rfds);
+
+               rc = select(highest + 1, &rfds, NULL, NULL, NULL);
+               if (rc >= 0)
+                       rc = tapdisk_daemon_check_fds(&rfds);
+       }
+
+       return rc;
+}
+
+/*
+ * Decide whether 'channel' shares a tapdisk with an existing channel.
+ *
+ * The descriptors and pid are cleared first.  Sharing is enabled only
+ * if the xenstore node channel->share_tapdisk_str exists; in that case
+ * the descriptors, channel id and pid of the first listed channel with
+ * the same driver type are copied into 'channel'.
+ */
+void
+tapdisk_daemon_find_channel(tapdisk_channel_t *channel)
+{
+       tapdisk_channel_t *other, *next;
+
+       channel->read_fd     = 0;
+       channel->write_fd    = 0;
+       channel->tapdisk_pid = 0;
+
+       /* do we want multiple vbds per tapdisk? */
+       if (xs_exists(tapdisk_daemon.xsh, channel->share_tapdisk_str)) {
+               channel->shared = 1;
+       } else {
+               channel->shared = 0;
+               return;
+       }
+
+       /* check if we already have a process started */
+       tapdisk_daemon_for_each_channel(other, next) {
+               if (other->drivertype != channel->drivertype)
+                       continue;
+
+               channel->write_fd    = other->write_fd;
+               channel->read_fd     = other->read_fd;
+               channel->channel_id  = other->channel_id;
+               channel->tapdisk_pid = other->tapdisk_pid;
+               return;
+       }
+}
+
+/*
+ * Unlink 'channel' from the daemon's list.  Its descriptors are closed
+ * only if no remaining channel has the same channel_id.
+ */
+void
+tapdisk_daemon_close_channel(tapdisk_channel_t *channel)
+{
+       int still_used = 0;
+       tapdisk_channel_t *other, *next;
+
+       list_del(&channel->list);
+
+       tapdisk_daemon_for_each_channel(other, next) {
+               if (other->channel_id == channel->channel_id) {
+                       still_used = 1;
+                       break;
+               }
+       }
+
+       if (still_used)
+               return;
+
+       close(channel->read_fd);
+       close(channel->write_fd);
+}
+
+/*
+ * Daemon entry point: detach from the terminal, raise the core-dump
+ * limit, open the log, take the pid file, then initialise and start the
+ * daemon and enter its event loop.
+ *
+ * Returns 0, or the negative error of the start-up step that failed.
+ */
+int
+main(int argc, char *argv[])
+{
+       int err;
+       char buf[128];
+
+       if (daemon(0, 0)) {
+         EPRINTF("daemon() failed (%d)\n", errno);
+         return -errno;
+       }
+
+       /* CORE_DUMP is defined right here, so the block below is always built */
+#define CORE_DUMP
+#if defined(CORE_DUMP)
+#include <sys/resource.h>
+       {
+               /* set up core-dumps*/
+               struct rlimit rlim;
+               rlim.rlim_cur = RLIM_INFINITY;
+               rlim.rlim_max = RLIM_INFINITY;
+               /* failure is logged but does not stop the daemon */
+               if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+                       EPRINTF("setrlimit failed: %d\n", errno);
+       }
+#endif
+
+       /* log identity carries the pid obtained after daemon() */
+       snprintf(buf, sizeof(buf), "BLKTAP-DAEMON[%d]", getpid());
+       openlog(buf, LOG_CONS | LOG_ODELAY, LOG_DAEMON);
+
+       /* exits the process if another instance holds the pid file lock */
+       err = tapdisk_daemon_write_pidfile(getpid());
+       if (err)
+               goto out;
+
+       tapdisk_daemon_print_drivers();
+
+       err = tapdisk_daemon_init();
+       if (err)
+               goto out;
+
+       err = tapdisk_daemon_start();
+       if (err)
+               goto out;
+
+       /*
+        * tapdisk_daemon_run() loops without an exit condition, so the
+        * shutdown calls below are not reached as the code stands.
+        */
+       tapdisk_daemon_run();
+
+       tapdisk_daemon_stop();
+       tapdisk_daemon_free();
+
+       err = 0;
+
+out:
+       if (err)
+               EPRINTF("failed to start %s: %d\n", argv[0], err);
+       closelog();
+       return err;
+}
diff -r f210a633571c -r 1c627434605e tools/blktap2/daemon/tapdisk-dispatch-common.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/daemon/tapdisk-dispatch-common.c    Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,94 @@
+/*
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "tapdisk-dispatch.h"
+
+/*
+ * Return the index in 'str' of the (len + 1)-th occurrence of 'c'.
+ * If 'str' holds exactly 'len' occurrences, the length of 'str' is
+ * returned instead; with fewer occurrences the result is -ERANGE.
+ */
+int
+strsep_len(const char *str, char c, unsigned int len)
+{
+       unsigned int pos = 0;
+       unsigned int remaining = len;
+
+       while (str[pos] != '\0') {
+               if (str[pos] == c) {
+                       if (remaining == 0)
+                               return pos;
+                       remaining--;
+               }
+               pos++;
+       }
+
+       if (remaining != 0)
+               return -ERANGE;
+
+       return pos;
+}
+
+/*
+ * (Re)create the character/block device node 'devname'.
+ *
+ * Any existing node is unlinked, BLKTAP_DEV_DIR is created if missing,
+ * and the node is made with mode 'perm' and device number major:minor.
+ * When mknod() fails, the state of 'devname' is logged for diagnosis.
+ *
+ * Returns 0 on success or the negative errno of the failing call.
+ */
+int
+make_blktap_device(char *devname, int major, int minor, int perm)
+{
+       int err;
+
+       err = unlink(devname);
+       if (err && errno != ENOENT) {
+               /* capture errno before logging can overwrite it */
+               err = -errno;
+               EPRINTF("unlink %s failed: %d\n", devname, -err);
+               return err;
+       }
+
+       /* Need to create device */
+       err = mkdir(BLKTAP_DEV_DIR, 0755);
+       if (err && errno != EEXIST) {
+               err = -errno;
+               EPRINTF("Failed to create %s directory\n", BLKTAP_DEV_DIR);
+               return err;
+       }
+
+       err = mknod(devname, perm, makedev(major, minor));
+       if (err) {
+               int ret = -errno;
+               struct stat st;
+
+               EPRINTF("mknod %s failed: %d\n", devname, ret);
+
+               /* best-effort diagnostics; the mknod error is what we return */
+               err = lstat(devname, &st);
+               if (err) {
+                       DPRINTF("lstat %s failed: %d\n", devname, -errno);
+                       err = access(devname, F_OK);
+                       if (err)
+                               DPRINTF("access %s failed: %d\n",
+                                       devname, -errno);
+                       else
+                               DPRINTF("access %s succeeded\n", devname);
+               } else
+                       DPRINTF("lstat %s: %u:%u\n", devname,
+                               (unsigned int)st.st_rdev >> 8,
+                               (unsigned int)st.st_rdev & 0xff);
+
+               return ret;
+       }
+
+       DPRINTF("Created %s device\n", devname);
+       return 0;
+}
diff -r f210a633571c -r 1c627434605e tools/blktap2/daemon/tapdisk-dispatch.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/daemon/tapdisk-dispatch.h   Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,95 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_DISPATCH_H_
+#define _TAPDISK_DISPATCH_H_
+
+#include "xs_api.h"
+#include "blktaplib.h"
+#include "tapdisk-message.h"
+
+/*
+ * Per-device state kept by the daemon: one channel per probed backend
+ * node, linked into the daemon's channel list.
+ */
+struct tapdisk_channel {
+       /* TAPDISK_CHANNEL_* state; starts as TAPDISK_CHANNEL_IDLE */
+       int                       state;
+
+       /* descriptors for talking to tapdisk; may be shared by several
+        * channels that carry the same channel_id */
+       int                       read_fd;
+       int                       write_fd;
+       /* blktap control device descriptor passed to tapdisk_channel_open() */
+       int                       blktap_fd;
+       int                       channel_id;
+
+       /* single character read from the xenstore "mode" node */
+       char                      mode;
+       /* 1 if the tapdisk may be shared with other channels, else 0 */
+       char                      shared;
+       /* non-zero makes tapdisk_channel_close() send a shutdown request */
+       char                      open;
+       /* value of the xenstore "frontend-id" node */
+       unsigned int              domid;
+       /* parsed from the last component of 'path' */
+       unsigned int              busid;
+       unsigned int              major;
+       unsigned int              minor;
+       /* one of TAPDISK_STORAGE_TYPE_* */
+       unsigned int              storage;
+       /* idnum of the dtypes[] entry selected from 'params' */
+       unsigned int              drivertype;
+       /* identifier matched against the cookie of incoming messages */
+       uint16_t                  cookie;
+       pid_t                     tapdisk_pid;
+
+       /*
+        * special accounting needed to handle pause
+        * requests received before tapdisk process is ready
+        */
+       char                      connected;
+       char                      pause_needed;
+
+       /* backend node of the device; copy owned by the channel */
+       char                     *path;
+       /* values of the xenstore "frontend" and "params" nodes */
+       char                     *frontpath;
+       char                     *params;
+       /* points into 'params', just past the first ':' -- never freed */
+       char                     *vdi_path;
+       /* xenstore node names owned by the channel */
+       char                     *uuid_str;
+       char                     *pause_str;
+       char                     *pause_done_str;
+       char                     *shutdown_str;
+       /* set from a string literal -- never freed */
+       char                     *share_tapdisk_str;
+
+       image_t                   image;
+
+       /* link in the daemon's channel list */
+       struct list_head          list;
+       /* xenstore watches on pause_str and shutdown_str */
+       struct xenbus_watch       pause_watch;
+       struct xenbus_watch       shutdown_watch;
+
+       /* xenstore connection passed to tapdisk_channel_open() */
+       struct xs_handle         *xsh;
+};
+
+typedef struct tapdisk_channel tapdisk_channel_t;
+
+/* index of the (len + 1)-th 'c' in str; -ERANGE if there are too few */
+int strsep_len(const char *str, char c, unsigned int len);
+/* (re)create device node 'devname'; 0 or a negative errno */
+int make_blktap_device(char *devname, int major, int minor, int perm);
+
+/* allocate and start a channel for backend node 'node'; 0 on success */
+int tapdisk_channel_open(tapdisk_channel_t **,
+                        char *node, struct xs_handle *,
+                        int blktap_fd, uint16_t cookie);
+/* tear the channel down and free it */
+void tapdisk_channel_close(tapdisk_channel_t *);
+
+/* attach the channel to an existing shared tapdisk, if sharing is enabled */
+void tapdisk_daemon_find_channel(tapdisk_channel_t *);
+/* unlink the channel; close its descriptors unless still shared */
+void tapdisk_daemon_close_channel(tapdisk_channel_t *);
+
+/* dispatch a message from tapdisk to the handler for its type */
+int tapdisk_channel_receive_message(tapdisk_channel_t *, tapdisk_message_t *);
+
+#endif
diff -r f210a633571c -r 1c627434605e tools/blktap2/drivers/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/drivers/Makefile    Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,112 @@
+# Builds the tapdisk programs together with the qcow and lock utilities.
+XEN_ROOT=../../../
+BLKTAP_ROOT= ..
+include $(XEN_ROOT)/tools/Rules.mk
+
+LIBVHDDIR  = $(BLKTAP_ROOT)/vhd/lib
+# Defined before the CFLAGS lines that reference it, so the value is available even if CFLAGS is expanded immediately.
+LIBAIO_DIR = $(XEN_ROOT)/tools/libaio/src
+
+# "install" copies IBIN, LOCK_UTIL and QCOW_UTIL into INST_DIR.
+IBIN       = tapdisk tapdisk2 td-util tapdisk-client tapdisk-stream tapdisk-diff
+QCOW_UTIL  = img2qcow qcow-create qcow2raw
+LOCK_UTIL  = lock-util
+INST_DIR   = $(SBINDIR)
+
+CFLAGS    += -Werror -g -O0
+CFLAGS    += -Wno-unused
+CFLAGS    += -fno-strict-aliasing
+CFLAGS    += -I../lib -I../../libxc
+CFLAGS    += -I../include -I../../include
+CFLAGS    += -I $(LIBAIO_DIR)
+CFLAGS    += -D_GNU_SOURCE
+CFLAGS    += -DUSE_NFS_LOCKS
+
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS            += -fPIC
+endif
+
+LIBS      += -lrt -lz
+
+# ./check_gcrypt prints "yes" when libgcrypt can be used; otherwise link libcrypto.
+ifeq ($(shell . ./check_gcrypt $(CC)),yes)
+CFLAGS += -DUSE_GCRYPT
+CRYPT_LIB += -lgcrypt
+else
+CRYPT_LIB += -lcrypto
+$(warning === libgcrypt not installed: falling back to libcrypto ===)
+endif
+
+LDFLAGS_img := $(CRYPT_LIB) -lpthread -lz
+
+# Target-specific additions: libvhd/libuuid, the static libaio archive, extra include paths.
+tapdisk tapdisk2 td-util tapdisk-stream tapdisk-diff $(QCOW_UTIL): LIBS += -L$(LIBVHDDIR) -lvhd -luuid
+
+tapdisk tapdisk2 tapdisk-stream tapdisk-diff $(QCOW_UTIL): AIOLIBS := $(LIBAIO_DIR)/libaio.a
+tapdisk tapdisk-client tapdisk-stream tapdisk-diff $(QCOW_UTIL): CFLAGS  += -I$(LIBAIO_DIR) -I$(XEN_LIBXC)
+
+ifeq ($(VHD_STATIC),y)
+td-util: CFLAGS += -static
+endif
+
+# Object groups linked into the programs below.
+TAP-OBJS-y  := scheduler.o
+TAP-OBJS-y  += tapdisk-ipc.o
+TAP-OBJS-y  += tapdisk-vbd.o
+TAP-OBJS-y  += tapdisk-image.o
+TAP-OBJS-y  += tapdisk-driver.o
+TAP-OBJS-y  += tapdisk-interface.o
+TAP-OBJS-y  += tapdisk-server.o
+TAP-OBJS-y  += tapdisk-queue.o
+TAP-OBJS-y  += tapdisk-filter.o
+TAP-OBJS-y  += tapdisk-log.o
+TAP-OBJS-y  += tapdisk-utils.o
+TAP-OBJS-y  += io-optimize.o
+TAP-OBJS-y  += lock.o
+TAP-OBJS-$(CONFIG_Linux)  += blk_linux.o
+
+MISC-OBJS-y := atomicio.o
+
+BLK-OBJS-y  := block-aio.o
+BLK-OBJS-y  += block-ram.o
+BLK-OBJS-y  += block-cache.o
+BLK-OBJS-y  += block-vhd.o
+BLK-OBJS-y  += block-log.o
+BLK-OBJS-y  += block-qcow.o
+BLK-OBJS-y  += aes.o
+
+all: $(IBIN) lock-util qcow-util
+
+tapdisk: $(TAP-OBJS-y) $(BLK-OBJS-y) $(MISC-OBJS-y) tapdisk.c
+	$(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS)  $(LDFLAGS_img)
+
+tapdisk2: $(TAP-OBJS-y) $(BLK-OBJS-y) $(MISC-OBJS-y) tapdisk2.c
+	$(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS) $(LDFLAGS_img)
+
+tapdisk-client: tapdisk-client.o
+	$(CC) $(CFLAGS) -o $@ $^ $(LIBS)  $(LDFLAGS_img)
+
+# Static pattern rule: each program is linked from its own .o plus the shared objects.
+tapdisk-stream tapdisk-diff: %: %.o $(TAP-OBJS-y) $(BLK-OBJS-y)
+	$(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS)  $(LDFLAGS_img)
+
+td-util: td.o tapdisk-utils.o tapdisk-log.o
+	$(CC) $(CFLAGS) -o $@ $^ $(LIBS)  $(LDFLAGS_img)
+
+lock-util: lock.c
+	$(CC) $(CFLAGS) -DUTIL -o lock-util lock.c $(LIBS)
+
+.PHONY: qcow-util
+qcow-util: img2qcow qcow2raw qcow-create
+
+img2qcow qcow2raw qcow-create: %: %.o $(TAP-OBJS-y) $(BLK-OBJS-y)
+	$(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS) $(LDFLAGS_img)
+
+install: all
+	$(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+	$(INSTALL_PROG) $(IBIN) $(LOCK_UTIL) $(QCOW_UTIL) $(DESTDIR)$(INST_DIR)
+
+clean:
+	rm -rf *.o *~ xen TAGS $(IBIN) $(LIB) $(LOCK_UTIL) $(QCOW_UTIL)
+
+.PHONY: all clean install
diff -r f210a633571c -r 1c627434605e tools/blktap2/drivers/aes.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/drivers/aes.c       Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,1319 @@
+/**
+ * 
+ * aes.c - integrated in QEMU by Fabrice Bellard from the OpenSSL project.
+ */
+/*
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@xxxxxxxxxxxxxxxxxxx>
+ * @author Antoon Bosselaers <antoon.bosselaers@xxxxxxxxxxxxxxxxxxx>
+ * @author Paulo Barreto <paulo.barreto@xxxxxxxxxxxx>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+//#include "vl.h"
+#include <inttypes.h>
+#include <string.h>
+#include "aes.h"
+
+//#define NDEBUG
+#include <assert.h>
+
+typedef uint32_t u32;   /* short fixed-width aliases used by the macros and tables below */
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+#define MAXKC   (256/32)   /* 256 bits counted in 32-bit words */
+#define MAXKB   (256/8)    /* 256 bits counted in bytes */
+#define MAXNR   14         /* NOTE(review): presumably the maximum round count - confirm */
+
+/* This controls loop-unrolling in aes_core.c */
+#undef FULL_UNROLL
+# define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] <<  8) ^ ((u32)(pt)[3])) /* read 4 bytes as one u32, most significant byte first */
+# define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >>  8); (ct)[3] = (u8)(st); } /* store a u32 as 4 bytes, most significant byte first */
+
+/*
+Te0[x] = S [x].[02, 01, 01, 03];
+Te1[x] = S [x].[03, 02, 01, 01];
+Te2[x] = S [x].[01, 03, 02, 01];
+Te3[x] = S [x].[01, 01, 03, 02];
+Te4[x] = S [x].[01, 01, 01, 01];
+
+Td0[x] = Si[x].[0e, 09, 0d, 0b];
+Td1[x] = Si[x].[0b, 0e, 09, 0d];
+Td2[x] = Si[x].[0d, 0b, 0e, 09];
+Td3[x] = Si[x].[09, 0d, 0b, 0e];
+Td4[x] = Si[x].[01, 01, 01, 01];
+*/
+
+static const u32 Te0[256] = {
+    0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
+    0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
+    0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
+    0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
+    0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
+    0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
+    0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
+    0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
+    0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
+    0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
+    0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
+    0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
+    0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
+    0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
+    0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
+    0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
+    0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
+    0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
+    0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
+    0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
+    0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
+    0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
+    0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
+    0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
+    0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
+    0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
+    0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
+    0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
+    0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
+    0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
+    0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
+    0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
+    0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
+    0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
+    0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
+    0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
+    0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
+    0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
+    0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
+    0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
+    0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
+    0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
+    0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
+    0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
+    0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
+    0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
+    0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
+    0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
+    0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
+    0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
+    0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
+    0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
+    0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
+    0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
+    0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
+    0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
+    0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
+    0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
+    0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
+    0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
+    0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
+    0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
+    0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
+    0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
+};
+static const u32 Te1[256] = {
+    0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
+    0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
+    0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
+    0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
+    0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
+    0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
+    0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
+    0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
+    0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
+    0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
+    0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
+    0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
+    0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
+    0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
+    0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
+    0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
+    0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
+    0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
+    0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
+    0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
+    0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
+    0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
+    0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
+    0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
+    0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
+    0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
+    0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
+    0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
+    0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
+    0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
+    0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
+    0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
+    0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
+    0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
+    0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
+    0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
+    0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
+    0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
+    0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
+    0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
+    0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
+    0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
+    0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
+    0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
+    0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
+    0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
+    0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
+    0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
+    0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
+    0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
+    0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
+    0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
+    0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
+    0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
+    0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
+    0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
+    0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
+    0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
+    0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
+    0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
+    0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
+    0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
+    0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
+    0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
+};
+static const u32 Te2[256] = {
+    0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
+    0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
+    0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
+    0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
+    0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
+    0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
+    0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
+    0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
+    0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
+    0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
+    0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
+    0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
+    0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
+    0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
+    0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
+    0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
+    0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
+    0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
+    0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
+    0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
+    0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
+    0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
+    0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
+    0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
+    0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
+    0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
+    0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
+    0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
+    0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
+    0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
+    0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
+    0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
+    0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
+    0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
+    0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
+    0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
+    0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
+    0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
+    0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
+    0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
+    0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
+    0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
+    0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
+    0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
+    0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
+    0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
+    0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
+    0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
+    0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
+    0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
+    0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
+    0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
+    0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
+    0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
+    0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
+    0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
+    0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
+    0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
+    0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
+    0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
+    0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
+    0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
+    0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
+    0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
+};
+static const u32 Te3[256] = {
+
+    0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
+    0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
+    0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
+    0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
+    0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
+    0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
+    0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
+    0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
+    0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
+    0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
+    0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
+    0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
+    0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
+    0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
+    0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
+    0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
+    0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
+    0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
+    0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
+    0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
+    0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
+    0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
+    0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
+    0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
+    0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
+    0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
+    0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
+    0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
+    0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
+    0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
+    0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
+    0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
+    0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
+    0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
+    0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
+    0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
+    0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
+    0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
+    0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
+    0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
+    0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
+    0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
+    0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
+    0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
+    0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
+    0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
+    0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
+    0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
+    0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
+    0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
+    0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
+    0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
+    0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
+    0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
+    0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
+    0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
+    0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
+    0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
+    0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
+    0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
+    0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
+    0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
+    0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
+    0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
+};
+static const u32 Te4[256] = {
+    0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,
+    0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,
+    0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,
+    0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,
+    0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,
+    0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,
+    0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,
+    0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,
+    0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,
+    0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,
+    0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,
+    0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,
+    0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,
+    0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,
+    0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,
+    0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,
+    0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,
+    0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,
+    0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,
+    0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,
+    0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,
+    0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,
+    0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,
+    0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,
+    0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,
+    0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,
+    0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,
+    0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,
+    0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,
+    0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,
+    0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,
+    0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,
+    0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,
+    0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,
+    0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,
+    0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,
+    0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,
+    0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,
+    0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,
+    0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,
+    0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,
+    0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,
+    0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,
+    0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,
+    0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,
+    0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,
+    0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,
+    0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,
+    0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,
+    0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,
+    0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,
+    0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,
+    0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,
+    0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,
+    0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,
+    0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,
+    0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,
+    0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,
+    0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,
+    0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,
+    0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,
+    0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,
+    0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,
+    0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,
+};
+static const u32 Td0[256] = {
+    0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
+    0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
+    0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
+    0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
+    0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
+    0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
+    0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
+    0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
+    0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
+    0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
+    0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
+    0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
+    0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
+    0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
+    0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
+    0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
+    0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
+    0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
+    0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
+    0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
+    0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
+    0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
+    0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
+    0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
+    0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
+    0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
+    0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
+    0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
+    0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
+    0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
+    0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
+    0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
+    0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
+    0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
+    0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
+    0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
+    0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
+    0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
+    0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
+    0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
+    0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
+    0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
+    0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
+    0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
+    0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
+    0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
+    0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
+    0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
+    0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
+    0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
+    0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
+    0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
+    0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
+    0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
+    0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
+    0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
+    0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
+    0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
+    0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
+    0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
+    0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
+    0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
+    0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
+    0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
+};
+static const u32 Td1[256] = {
+    0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
+    0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
+    0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
+    0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
+    0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
+    0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
+    0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
+    0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
+    0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
+    0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
+    0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
+    0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
+    0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
+    0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
+    0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
+    0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
+    0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
+    0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
+    0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
+    0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
+    0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
+    0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
+    0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
+    0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
+    0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
+    0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
+    0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
+    0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
+    0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
+    0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
+    0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
+    0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
+    0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
+    0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
+    0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
+    0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
+    0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
+    0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
+    0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
+    0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
+    0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
+    0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
+    0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
+    0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
+    0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
+    0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
+    0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
+    0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
+    0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
+    0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
+    0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
+    0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
+    0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
+    0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
+    0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
+    0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
+    0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
+    0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
+    0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
+    0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
+    0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
+    0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
+    0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
+    0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
+};
+static const u32 Td2[256] = { /* 256-word decryption lookup table.
+     * AES_decrypt() indexes it with the state byte (x >> 8) & 0xff, and
+     * AES_set_decrypt_key() uses it for that same byte position when it
+     * applies the inverse MixColumn transform to the round keys.
+     * Entry i of Td3 is entry i of this table rotated right by 8 bits.
+     * NOTE(review): presumably the combined inverse S-box / inverse
+     * MixColumn T-table of the Rijndael reference code -- confirm against
+     * the reference implementation. */
+    0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
+    0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
+    0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
+    0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
+    0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
+    0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
+    0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
+    0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
+    0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
+    0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
+    0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
+    0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
+    0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
+    0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
+    0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
+    0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
+    0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
+    0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
+    0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
+    0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,

+    0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
+    0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
+    0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
+    0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
+    0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
+    0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
+    0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
+    0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
+    0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
+    0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
+    0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
+    0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
+    0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
+    0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
+    0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
+    0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
+    0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
+    0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
+    0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
+    0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
+    0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
+    0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
+    0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
+    0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
+    0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
+    0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
+    0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
+    0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
+    0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
+    0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
+    0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
+    0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
+    0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
+    0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
+    0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
+    0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
+    0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
+    0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
+    0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
+    0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
+    0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
+    0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
+    0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
+    0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
+};
+static const u32 Td3[256] = { /* 256-word decryption lookup table.
+     * AES_decrypt() indexes it with the low state byte (x & 0xff), and
+     * AES_set_decrypt_key() uses it for that same byte position when it
+     * applies the inverse MixColumn transform to the round keys.
+     * Each entry is the corresponding Td2 entry rotated right by 8 bits.
+     * NOTE(review): presumably the combined inverse S-box / inverse
+     * MixColumn T-table of the Rijndael reference code -- confirm against
+     * the reference implementation. */
+    0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
+    0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
+    0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
+    0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
+    0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
+    0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
+    0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
+    0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
+    0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
+    0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
+    0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
+    0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
+    0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
+    0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
+    0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
+    0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
+    0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
+    0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
+    0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
+    0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
+    0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
+    0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
+    0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
+    0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
+    0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
+    0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
+    0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
+    0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
+    0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
+    0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
+    0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
+    0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
+    0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
+    0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
+    0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
+    0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
+    0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
+    0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
+    0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
+    0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
+    0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
+    0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
+    0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
+    0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
+    0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
+    0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
+    0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
+    0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
+    0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
+    0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
+    0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
+    0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
+    0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
+    0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
+    0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
+    0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
+    0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
+    0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
+    0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
+    0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
+    0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
+    0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
+    0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
+    0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
+};
+static const u32 Td4[256] = { /* 256-word lookup table in which every
+     * entry repeats one byte value in all four byte positions, so a
+     * caller can mask out whichever byte lane it needs.
+     * NOTE(review): presumably the AES inverse S-box, used for the final
+     * decryption round -- its use site is outside this part of the
+     * file; confirm there. */
+    0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U,
+    0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U,
+    0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU,
+    0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU,
+    0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U,
+    0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U,
+    0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U,
+    0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU,
+    0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U,
+    0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU,
+    0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU,
+    0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU,
+    0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U,
+    0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U,
+    0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U,
+    0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U,
+    0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U,
+    0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U,
+    0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU,
+    0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U,
+    0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U,
+    0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU,
+    0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U,
+    0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U,
+    0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U,
+    0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU,
+    0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U,
+    0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U,
+    0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU,
+    0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U,
+    0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U,
+    0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU,
+    0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U,
+    0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU,
+    0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU,
+    0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U,
+    0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U,
+    0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U,
+    0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U,
+    0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU,
+    0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U,
+    0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U,
+    0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU,
+    0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU,
+    0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU,
+    0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U,
+    0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU,
+    0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U,
+    0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U,
+    0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U,
+    0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U,
+    0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU,
+    0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U,
+    0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU,
+    0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU,
+    0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU,
+    0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU,
+    0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U,
+    0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU,
+    0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U,
+    0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU,
+    0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U,
+    0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U,
+    0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU,
+};
+static const u32 rcon[] = { /* round constants: AES_set_encrypt_key() XORs
+        * rcon[i] into the first word of each newly derived key group;
+        * every value occupies only the most significant byte */
+       0x01000000, 0x02000000, 0x04000000, 0x08000000,
+       0x10000000, 0x20000000, 0x40000000, 0x80000000,
+       0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses
+        * more than 10 rcon values */
+};
+
+/**
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * userKey: raw key material; bits / 8 bytes are consumed, one 32-bit
+ *          word at a time through GETU32 (macro defined elsewhere,
+ *          assumed to load 4 bytes).
+ * bits:    key length in bits; must be 128, 192 or 256.
+ * key:     receives the schedule: key->rounds is set to 10, 12 or 14
+ *          and key->rd_key is filled with 4 * (rounds + 1) words.
+ *
+ * Returns 0 on success, -1 if userKey or key is NULL, and -2 if bits is
+ * not one of the supported sizes.
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+                       AES_KEY *key) {
+
+       u32 *rk;
+       int i = 0; /* index of the next rcon value to use */
+       u32 temp;
+
+       if (!userKey || !key)
+               return -1;
+       if (bits != 128 && bits != 192 && bits != 256)
+               return -2;
+
+       rk = key->rd_key;
+
+       if (bits==128)
+               key->rounds = 10;
+       else if (bits==192)
+               key->rounds = 12;
+       else
+               key->rounds = 14;
+
+       rk[0] = GETU32(userKey     ); /* key words 0-3: common to every key size */
+       rk[1] = GETU32(userKey +  4);
+       rk[2] = GETU32(userKey +  8);
+       rk[3] = GETU32(userKey + 12);
+       if (bits == 128) {
+               while (1) { /* each pass derives rk[4..7] from rk[0..3] */
+                       temp  = rk[3];
+                       rk[4] = rk[0] ^ /* Te4 of temp's bytes taken in rotated-left-by-one-byte order, then rcon */
+                               (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+                               (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^
+                               (Te4[(temp      ) & 0xff] & 0x0000ff00) ^
+                               (Te4[(temp >> 24)       ] & 0x000000ff) ^
+                               rcon[i];
+                       rk[5] = rk[1] ^ rk[4];
+                       rk[6] = rk[2] ^ rk[5];
+                       rk[7] = rk[3] ^ rk[6];
+                       if (++i == 10) { /* tenth derived group written: schedule complete */
+                               return 0;
+                       }
+                       rk += 4; /* slide the window onto the group just produced */
+               }
+       }
+       rk[4] = GETU32(userKey + 16); /* key words 4-5: 192- and 256-bit keys only */
+       rk[5] = GETU32(userKey + 20);
+       if (bits == 192) {
+               while (1) { /* each pass derives up to six words from rk[0..5] */
+                       temp = rk[ 5];
+                       rk[ 6] = rk[ 0] ^
+                               (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+                               (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^
+                               (Te4[(temp      ) & 0xff] & 0x0000ff00) ^
+                               (Te4[(temp >> 24)       ] & 0x000000ff) ^
+                               rcon[i];
+                       rk[ 7] = rk[ 1] ^ rk[ 6];
+                       rk[ 8] = rk[ 2] ^ rk[ 7];
+                       rk[ 9] = rk[ 3] ^ rk[ 8];
+                       if (++i == 8) { /* last pass stops after four words; rk[10..11] are not written */
+                               return 0;
+                       }
+                       rk[10] = rk[ 4] ^ rk[ 9];
+                       rk[11] = rk[ 5] ^ rk[10];
+                       rk += 6;
+               }
+       }
+       rk[6] = GETU32(userKey + 24); /* key words 6-7: 256-bit keys only */
+       rk[7] = GETU32(userKey + 28);
+       if (bits == 256) {
+               while (1) { /* each pass derives up to eight words from rk[0..7] */
+                       temp = rk[ 7];
+                       rk[ 8] = rk[ 0] ^
+                               (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+                               (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^
+                               (Te4[(temp      ) & 0xff] & 0x0000ff00) ^
+                               (Te4[(temp >> 24)       ] & 0x000000ff) ^
+                               rcon[i];
+                       rk[ 9] = rk[ 1] ^ rk[ 8];
+                       rk[10] = rk[ 2] ^ rk[ 9];
+                       rk[11] = rk[ 3] ^ rk[10];
+                       if (++i == 7) { /* last pass stops after the first four words */
+                               return 0;
+                       }
+                       temp = rk[11]; /* second half: Te4 of each byte in place, no rotation and no rcon */
+                       rk[12] = rk[ 4] ^
+                               (Te4[(temp >> 24)       ] & 0xff000000) ^
+                               (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
+                               (Te4[(temp >>  8) & 0xff] & 0x0000ff00) ^
+                               (Te4[(temp      ) & 0xff] & 0x000000ff);
+                       rk[13] = rk[ 5] ^ rk[12];
+                       rk[14] = rk[ 6] ^ rk[13];
+                       rk[15] = rk[ 7] ^ rk[14];
+
+                       rk += 8;
+               }
+       }
+       return 0; /* not reached: every accepted key size returns from its loop above */
+}
+
+/**
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * Builds the encryption schedule with AES_set_encrypt_key(), reverses the
+ * order of its 4-word round keys in place, then applies the inverse
+ * MixColumn transform to every round key except the first and the last.
+ *
+ * userKey, bits, key: passed unchanged to AES_set_encrypt_key(), which
+ *                     also performs all argument validation.
+ *
+ * Returns 0 on success, or the negative status from AES_set_encrypt_key()
+ * when that call fails (key is then not processed any further).
+ */
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key) {
+
+        u32 *rk;
+       int i, j, status;
+       u32 temp;
+
+       /* first, start with an encryption schedule */
+       status = AES_set_encrypt_key(userKey, bits, key);
+       if (status < 0)
+               return status;
+
+       rk = key->rd_key;
+
+       /* invert the order of the round keys: swap the 4-word group at i
+        * with the one at j, walking inwards from both ends */
+       for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
+               temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
+               temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+               temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+               temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+       }
+       /* apply the inverse MixColumn transform to all round keys but the
+        * first and the last.
+        * NOTE(review): each byte goes through Te4 before indexing Td0..Td3,
+        * presumably so the forward S-box cancels the inverse S-box folded
+        * into the Td tables -- confirm against the table definitions. */
+       for (i = 1; i < (key->rounds); i++) {
+               rk += 4; /* step to the next round key; the first one is skipped */
+               rk[0] =
+                       Td0[Te4[(rk[0] >> 24)       ] & 0xff] ^
+                       Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^
+                       Td2[Te4[(rk[0] >>  8) & 0xff] & 0xff] ^
+                       Td3[Te4[(rk[0]      ) & 0xff] & 0xff];
+               rk[1] =
+                       Td0[Te4[(rk[1] >> 24)       ] & 0xff] ^
+                       Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^
+                       Td2[Te4[(rk[1] >>  8) & 0xff] & 0xff] ^
+                       Td3[Te4[(rk[1]      ) & 0xff] & 0xff];
+               rk[2] =
+                       Td0[Te4[(rk[2] >> 24)       ] & 0xff] ^
+                       Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^
+                       Td2[Te4[(rk[2] >>  8) & 0xff] & 0xff] ^
+                       Td3[Te4[(rk[2]      ) & 0xff] & 0xff];
+               rk[3] =
+                       Td0[Te4[(rk[3] >> 24)       ] & 0xff] ^
+                       Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^
+                       Td2[Te4[(rk[3] >>  8) & 0xff] & 0xff] ^
+                       Td3[Te4[(rk[3]      ) & 0xff] & 0xff];
+       }
+       return 0;
+}
+
+#ifndef AES_ASM
+/*
+ * Encrypt a single block
+ * in and out can overlap
+ *
+ * in:  input block; four 32-bit words are fetched with GETU32 at
+ *      offsets 0, 4, 8 and 12.
+ * out: output block; four 32-bit words are stored with PUTU32 at the
+ *      same offsets.
+ * key: expanded schedule; key->rd_key and key->rounds are read.
+ *
+ * All of in is fetched before anything is stored to out, which is why
+ * the two may overlap. NULL arguments are caught only by assert().
+ * With FULL_UNROLL the rounds are written out explicitly; otherwise a
+ * loop performs two rounds per pass. Both paths leave the state in
+ * t0..t3 and rk advanced by 4 * key->rounds words for the last round.
+ */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+                const AES_KEY *key) {
+
+       const u32 *rk;
+       u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+       int r;
+#endif /* ?FULL_UNROLL */
+
+       assert(in && out && key);
+       rk = key->rd_key;
+
+       /*
+        * map byte array block to cipher state
+        * and add initial round key:
+        */
+       s0 = GETU32(in     ) ^ rk[0];
+       s1 = GETU32(in +  4) ^ rk[1];
+       s2 = GETU32(in +  8) ^ rk[2];
+       s3 = GETU32(in + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+       /* round 1: */
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ 
Te3[s3 & 0xff] ^ rk[ 4];
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ 
Te3[s0 & 0xff] ^ rk[ 5];
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ 
Te3[s1 & 0xff] ^ rk[ 6];
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ 
Te3[s2 & 0xff] ^ rk[ 7];
+       /* round 2: */
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ 
Te3[t3 & 0xff] ^ rk[ 8];
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ 
Te3[t0 & 0xff] ^ rk[ 9];
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ 
Te3[t1 & 0xff] ^ rk[10];
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ 
Te3[t2 & 0xff] ^ rk[11];
+       /* round 3: */
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ 
Te3[s3 & 0xff] ^ rk[12];
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ 
Te3[s0 & 0xff] ^ rk[13];
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ 
Te3[s1 & 0xff] ^ rk[14];
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ 
Te3[s2 & 0xff] ^ rk[15];
+       /* round 4: */
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ 
Te3[t3 & 0xff] ^ rk[16];
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ 
Te3[t0 & 0xff] ^ rk[17];
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ 
Te3[t1 & 0xff] ^ rk[18];
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ 
Te3[t2 & 0xff] ^ rk[19];
+       /* round 5: */
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ 
Te3[s3 & 0xff] ^ rk[20];
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ 
Te3[s0 & 0xff] ^ rk[21];
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ 
Te3[s1 & 0xff] ^ rk[22];
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ 
Te3[s2 & 0xff] ^ rk[23];
+       /* round 6: */
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ 
Te3[t3 & 0xff] ^ rk[24];
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ 
Te3[t0 & 0xff] ^ rk[25];
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ 
Te3[t1 & 0xff] ^ rk[26];
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ 
Te3[t2 & 0xff] ^ rk[27];
+       /* round 7: */
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ 
Te3[s3 & 0xff] ^ rk[28];
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ 
Te3[s0 & 0xff] ^ rk[29];
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ 
Te3[s1 & 0xff] ^ rk[30];
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ 
Te3[s2 & 0xff] ^ rk[31];
+       /* round 8: */
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ 
Te3[t3 & 0xff] ^ rk[32];
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ 
Te3[t0 & 0xff] ^ rk[33];
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ 
Te3[t1 & 0xff] ^ rk[34];
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ 
Te3[t2 & 0xff] ^ rk[35];
+       /* round 9: */
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ 
Te3[s3 & 0xff] ^ rk[36];
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ 
Te3[s0 & 0xff] ^ rk[37];
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ 
Te3[s1 & 0xff] ^ rk[38];
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ 
Te3[s2 & 0xff] ^ rk[39];
+    if (key->rounds > 10) { /* extra rounds for schedules longer than 10 rounds */
+        /* round 10: */
+        s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ 
Te3[t3 & 0xff] ^ rk[40];
+        s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ 
Te3[t0 & 0xff] ^ rk[41];
+        s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ 
Te3[t1 & 0xff] ^ rk[42];
+        s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ 
Te3[t2 & 0xff] ^ rk[43];
+        /* round 11: */
+        t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ 
Te3[s3 & 0xff] ^ rk[44];
+        t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ 
Te3[s0 & 0xff] ^ rk[45];
+        t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ 
Te3[s1 & 0xff] ^ rk[46];
+        t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ 
Te3[s2 & 0xff] ^ rk[47];
+        if (key->rounds > 12) { /* extra rounds for schedules longer than 12 rounds */
+            /* round 12: */
+            s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 
0xff] ^ Te3[t3 & 0xff] ^ rk[48];
+            s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 
0xff] ^ Te3[t0 & 0xff] ^ rk[49];
+            s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 
0xff] ^ Te3[t1 & 0xff] ^ rk[50];
+            s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 
0xff] ^ Te3[t2 & 0xff] ^ rk[51];
+            /* round 13: */
+            t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 
0xff] ^ Te3[s3 & 0xff] ^ rk[52];
+            t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 
0xff] ^ Te3[s0 & 0xff] ^ rk[53];
+            t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 
0xff] ^ Te3[s1 & 0xff] ^ rk[54];
+            t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 
0xff] ^ Te3[s2 & 0xff] ^ rk[55];
+        }
+    }
+    rk += key->rounds << 2; /* point rk at the round key used by the last round */
+#else  /* !FULL_UNROLL */
+    /*
+     * Nr - 1 full rounds:
+     */
+    r = key->rounds >> 1; /* each loop pass performs two rounds */
+    for (;;) {
+        t0 =
+            Te0[(s0 >> 24)       ] ^
+            Te1[(s1 >> 16) & 0xff] ^
+            Te2[(s2 >>  8) & 0xff] ^
+            Te3[(s3      ) & 0xff] ^
+            rk[4];
+        t1 =
+            Te0[(s1 >> 24)       ] ^
+            Te1[(s2 >> 16) & 0xff] ^
+            Te2[(s3 >>  8) & 0xff] ^
+            Te3[(s0      ) & 0xff] ^
+            rk[5];
+        t2 =
+            Te0[(s2 >> 24)       ] ^
+            Te1[(s3 >> 16) & 0xff] ^
+            Te2[(s0 >>  8) & 0xff] ^
+            Te3[(s1      ) & 0xff] ^
+            rk[6];
+        t3 =
+            Te0[(s3 >> 24)       ] ^
+            Te1[(s0 >> 16) & 0xff] ^
+            Te2[(s1 >>  8) & 0xff] ^
+            Te3[(s2      ) & 0xff] ^
+            rk[7];
+
+        rk += 8; /* rk[0..3] is now the key for the round below, or for the last round after the break */
+        if (--r == 0) { /* leave after an s -> t round so the state is in t0..t3 */
+            break;
+        }
+
+        s0 =
+            Te0[(t0 >> 24)       ] ^
+            Te1[(t1 >> 16) & 0xff] ^
+            Te2[(t2 >>  8) & 0xff] ^
+            Te3[(t3      ) & 0xff] ^
+            rk[0];
+        s1 =
+            Te0[(t1 >> 24)       ] ^
+            Te1[(t2 >> 16) & 0xff] ^
+            Te2[(t3 >>  8) & 0xff] ^
+            Te3[(t0      ) & 0xff] ^
+            rk[1];
+        s2 =
+            Te0[(t2 >> 24)       ] ^
+            Te1[(t3 >> 16) & 0xff] ^
+            Te2[(t0 >>  8) & 0xff] ^
+            Te3[(t1      ) & 0xff] ^
+            rk[2];
+        s3 =
+            Te0[(t3 >> 24)       ] ^
+            Te1[(t0 >> 16) & 0xff] ^
+            Te2[(t1 >>  8) & 0xff] ^
+            Te3[(t2      ) & 0xff] ^
+            rk[3];
+    }
+#endif /* ?FULL_UNROLL */
+    /*
+        * apply last round and
+        * map cipher state to byte array block:
+        * (only Te4 is used here, one masked byte lane per lookup,
+        * instead of the Te0..Te3 tables of the earlier rounds)
+        */
+       s0 =
+               (Te4[(t0 >> 24)       ] & 0xff000000) ^
+               (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+               (Te4[(t2 >>  8) & 0xff] & 0x0000ff00) ^
+               (Te4[(t3      ) & 0xff] & 0x000000ff) ^
+               rk[0];
+       PUTU32(out     , s0);
+       s1 =
+               (Te4[(t1 >> 24)       ] & 0xff000000) ^
+               (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+               (Te4[(t3 >>  8) & 0xff] & 0x0000ff00) ^
+               (Te4[(t0      ) & 0xff] & 0x000000ff) ^
+               rk[1];
+       PUTU32(out +  4, s1);
+       s2 =
+               (Te4[(t2 >> 24)       ] & 0xff000000) ^
+               (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+               (Te4[(t0 >>  8) & 0xff] & 0x0000ff00) ^
+               (Te4[(t1      ) & 0xff] & 0x000000ff) ^
+               rk[2];
+       PUTU32(out +  8, s2);
+       s3 =
+               (Te4[(t3 >> 24)       ] & 0xff000000) ^
+               (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+               (Te4[(t1 >>  8) & 0xff] & 0x0000ff00) ^
+               (Te4[(t2      ) & 0xff] & 0x000000ff) ^
+               rk[3];
+       PUTU32(out + 12, s3);
+}
+
+/*
+ * Decrypt a single block
+ * in and out can overlap
+ */
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+                const AES_KEY *key) {
+
+       const u32 *rk;
+       u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+       int r;
+#endif /* ?FULL_UNROLL */
+
+       assert(in && out && key);
+       rk = key->rd_key;
+
+       /*
+        * map byte array block to cipher state
+        * and add initial round key:
+        */
+    s0 = GETU32(in     ) ^ rk[0];
+    s1 = GETU32(in +  4) ^ rk[1];
+    s2 = GETU32(in +  8) ^ rk[2];
+    s3 = GETU32(in + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+    /* round 1: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ 
Td3[s1 & 0xff] ^ rk[ 4];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ 
Td3[s2 & 0xff] ^ rk[ 5];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ 
Td3[s3 & 0xff] ^ rk[ 6];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ 
Td3[s0 & 0xff] ^ rk[ 7];
+    /* round 2: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ 
Td3[t1 & 0xff] ^ rk[ 8];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ 
Td3[t2 & 0xff] ^ rk[ 9];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ 
Td3[t3 & 0xff] ^ rk[10];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ 
Td3[t0 & 0xff] ^ rk[11];
+    /* round 3: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ 
Td3[s1 & 0xff] ^ rk[12];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ 
Td3[s2 & 0xff] ^ rk[13];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ 
Td3[s3 & 0xff] ^ rk[14];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ 
Td3[s0 & 0xff] ^ rk[15];
+    /* round 4: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ 
Td3[t1 & 0xff] ^ rk[16];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ 
Td3[t2 & 0xff] ^ rk[17];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ 
Td3[t3 & 0xff] ^ rk[18];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ 
Td3[t0 & 0xff] ^ rk[19];
+    /* round 5: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ 
Td3[s1 & 0xff] ^ rk[20];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ 
Td3[s2 & 0xff] ^ rk[21];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ 
Td3[s3 & 0xff] ^ rk[22];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ 
Td3[s0 & 0xff] ^ rk[23];
+    /* round 6: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ 
Td3[t1 & 0xff] ^ rk[24];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ 
Td3[t2 & 0xff] ^ rk[25];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ 
Td3[t3 & 0xff] ^ rk[26];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ 
Td3[t0 & 0xff] ^ rk[27];
+    /* round 7: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ 
Td3[s1 & 0xff] ^ rk[28];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ 
Td3[s2 & 0xff] ^ rk[29];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ 
Td3[s3 & 0xff] ^ rk[30];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ 
Td3[s0 & 0xff] ^ rk[31];
+    /* round 8: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ 
Td3[t1 & 0xff] ^ rk[32];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ 
Td3[t2 & 0xff] ^ rk[33];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ 
Td3[t3 & 0xff] ^ rk[34];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ 
Td3[t0 & 0xff] ^ rk[35];
+    /* round 9: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ 
Td3[s1 & 0xff] ^ rk[36];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ 
Td3[s2 & 0xff] ^ rk[37];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ 
Td3[s3 & 0xff] ^ rk[38];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ 
Td3[s0 & 0xff] ^ rk[39];
+    if (key->rounds > 10) {
+        /* round 10: */
+        s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ 
Td3[t1 & 0xff] ^ rk[40];
+        s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ 
Td3[t2 & 0xff] ^ rk[41];
+        s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ 
Td3[t3 & 0xff] ^ rk[42];
+        s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ 
Td3[t0 & 0xff] ^ rk[43];
+        /* round 11: */
+        t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ 
Td3[s1 & 0xff] ^ rk[44];
+        t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ 
Td3[s2 & 0xff] ^ rk[45];
+        t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ 
Td3[s3 & 0xff] ^ rk[46];
+        t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ 
Td3[s0 & 0xff] ^ rk[47];
+        if (key->rounds > 12) {
+            /* round 12: */
+            s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 
0xff] ^ Td3[t1 & 0xff] ^ rk[48];
+            s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 
0xff] ^ Td3[t2 & 0xff] ^ rk[49];
+            s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 
0xff] ^ Td3[t3 & 0xff] ^ rk[50];
+            s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 
0xff] ^ Td3[t0 & 0xff] ^ rk[51];
+            /* round 13: */
+            t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 
0xff] ^ Td3[s1 & 0xff] ^ rk[52];
+            t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 
0xff] ^ Td3[s2 & 0xff] ^ rk[53];
+            t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 
0xff] ^ Td3[s3 & 0xff] ^ rk[54];
+            t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 
0xff] ^ Td3[s0 & 0xff] ^ rk[55];
+        }
+    }
+       rk += key->rounds << 2;
+#else  /* !FULL_UNROLL */
+    /*
+     * Nr - 1 full rounds:
+     */
+    r = key->rounds >> 1;
+    for (;;) {
+        t0 =
+            Td0[(s0 >> 24)       ] ^
+            Td1[(s3 >> 16) & 0xff] ^
+            Td2[(s2 >>  8) & 0xff] ^
+            Td3[(s1      ) & 0xff] ^
+            rk[4];
+        t1 =
+            Td0[(s1 >> 24)       ] ^
+            Td1[(s0 >> 16) & 0xff] ^
+            Td2[(s3 >>  8) & 0xff] ^
+            Td3[(s2      ) & 0xff] ^
+            rk[5];
+        t2 =
+            Td0[(s2 >> 24)       ] ^
+            Td1[(s1 >> 16) & 0xff] ^
+            Td2[(s0 >>  8) & 0xff] ^
+            Td3[(s3      ) & 0xff] ^
+            rk[6];
+        t3 =
+            Td0[(s3 >> 24)       ] ^
+            Td1[(s2 >> 16) & 0xff] ^
+            Td2[(s1 >>  8) & 0xff] ^
+            Td3[(s0      ) & 0xff] ^
+            rk[7];
+
+        rk += 8;
+        if (--r == 0) {
+            break;
+        }
+
+        s0 =
+            Td0[(t0 >> 24)       ] ^
+            Td1[(t3 >> 16) & 0xff] ^
+            Td2[(t2 >>  8) & 0xff] ^
+            Td3[(t1      ) & 0xff] ^
+            rk[0];
+        s1 =
+            Td0[(t1 >> 24)       ] ^
+            Td1[(t0 >> 16) & 0xff] ^
+            Td2[(t3 >>  8) & 0xff] ^
+            Td3[(t2      ) & 0xff] ^
+            rk[1];
+        s2 =
+            Td0[(t2 >> 24)       ] ^
+            Td1[(t1 >> 16) & 0xff] ^
+            Td2[(t0 >>  8) & 0xff] ^
+            Td3[(t3      ) & 0xff] ^
+            rk[2];
+        s3 =
+            Td0[(t3 >> 24)       ] ^
+            Td1[(t2 >> 16) & 0xff] ^
+            Td2[(t1 >>  8) & 0xff] ^
+            Td3[(t0      ) & 0xff] ^
+            rk[3];
+    }
+#endif /* ?FULL_UNROLL */
+    /*
+        * apply last round and
+        * map cipher state to byte array block:
+        */
+       s0 =
+               (Td4[(t0 >> 24)       ] & 0xff000000) ^
+               (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+               (Td4[(t2 >>  8) & 0xff] & 0x0000ff00) ^
+               (Td4[(t1      ) & 0xff] & 0x000000ff) ^
+               rk[0];
+       PUTU32(out     , s0);
+       s1 =
+               (Td4[(t1 >> 24)       ] & 0xff000000) ^
+               (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+               (Td4[(t3 >>  8) & 0xff] & 0x0000ff00) ^
+               (Td4[(t2      ) & 0xff] & 0x000000ff) ^
+               rk[1];
+       PUTU32(out +  4, s1);
+       s2 =
+               (Td4[(t2 >> 24)       ] & 0xff000000) ^
+               (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+               (Td4[(t0 >>  8) & 0xff] & 0x0000ff00) ^
+               (Td4[(t3      ) & 0xff] & 0x000000ff) ^
+               rk[2];
+       PUTU32(out +  8, s2);
+       s3 =
+               (Td4[(t3 >> 24)       ] & 0xff000000) ^
+               (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+               (Td4[(t1 >>  8) & 0xff] & 0x0000ff00) ^
+               (Td4[(t0      ) & 0xff] & 0x000000ff) ^
+               rk[3];
+       PUTU32(out + 12, s3);
+}
+
+#endif /* AES_ASM */
+
+/*
+ * AES in CBC mode.
+ *
+ * in/out : source and destination buffers.
+ * length : number of input bytes.
+ * key    : expanded key, passed through to AES_encrypt()/AES_decrypt().
+ * ivec   : AES_BLOCK_SIZE-byte chaining value; updated in place so that
+ *          it always holds the last ciphertext block processed.
+ * enc    : non-zero to encrypt, zero to decrypt.
+ *
+ * Whole blocks are processed first.  A trailing partial block is handled
+ * as follows:
+ *   - encrypt: the missing plaintext bytes are treated as zero and a full
+ *     AES_BLOCK_SIZE bytes of ciphertext are written to out;
+ *   - decrypt: a full AES_BLOCK_SIZE bytes are read from in and only the
+ *     first (length % AES_BLOCK_SIZE) plaintext bytes are written to out.
+ */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                    const unsigned long length, const AES_KEY *key,
+                    unsigned char *ivec, const int enc)
+{
+
+       unsigned long n;
+       unsigned long len = length;
+       unsigned char tmp[AES_BLOCK_SIZE];
+       unsigned char cipher[AES_BLOCK_SIZE];
+
+       assert(in && out && key && ivec);
+
+       if (enc) {
+               while (len >= AES_BLOCK_SIZE) {
+                       for(n=0; n < AES_BLOCK_SIZE; ++n)
+                               tmp[n] = in[n] ^ ivec[n];
+                       AES_encrypt(tmp, out, key);
+                       memcpy(ivec, out, AES_BLOCK_SIZE);
+                       len -= AES_BLOCK_SIZE;
+                       in += AES_BLOCK_SIZE;
+                       out += AES_BLOCK_SIZE;
+               }
+               if (len) {
+                       for(n=0; n < len; ++n)
+                               tmp[n] = in[n] ^ ivec[n];
+                       /* zero padding: 0 ^ ivec[n] == ivec[n] */
+                       for(n=len; n < AES_BLOCK_SIZE; ++n)
+                               tmp[n] = ivec[n];
+                       AES_encrypt(tmp, tmp, key);
+                       memcpy(out, tmp, AES_BLOCK_SIZE);
+                       memcpy(ivec, tmp, AES_BLOCK_SIZE);
+               }
+       } else {
+               while (len >= AES_BLOCK_SIZE) {
+                       /* keep the ciphertext: in and out may alias */
+                       memcpy(tmp, in, AES_BLOCK_SIZE);
+                       AES_decrypt(in, out, key);
+                       for(n=0; n < AES_BLOCK_SIZE; ++n)
+                               out[n] ^= ivec[n];
+                       memcpy(ivec, tmp, AES_BLOCK_SIZE);
+                       len -= AES_BLOCK_SIZE;
+                       in += AES_BLOCK_SIZE;
+                       out += AES_BLOCK_SIZE;
+               }
+               if (len) {
+                       /*
+                        * The next chaining value is the ciphertext block,
+                        * not its decryption, so keep a copy of it apart
+                        * from the decrypted scratch buffer.
+                        */
+                       memcpy(cipher, in, AES_BLOCK_SIZE);
+                       AES_decrypt(cipher, tmp, key);
+                       for(n=0; n < len; ++n)
+                               out[n] = tmp[n] ^ ivec[n];
+                       memcpy(ivec, cipher, AES_BLOCK_SIZE);
+               }
+       }
+}
diff -r f210a633571c -r 1c627434605e tools/blktap2/drivers/aes.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/drivers/aes.h       Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,28 @@
+#ifndef QEMU_AES_H
+#define QEMU_AES_H
+
+#include <stdint.h>
+
+/* Largest supported round count; sizes the round-key array below. */
+#define AES_MAXNR 14
+/* Cipher block size in bytes (also the size of the CBC ivec buffer). */
+#define AES_BLOCK_SIZE 16
+
+/* Expanded key schedule handed to the AES_* routines. */
+struct aes_key_st {
+    /* four 32-bit round-key words per round, plus one extra round key */
+    uint32_t rd_key[4 *(AES_MAXNR + 1)];
+    /* round count; the decrypt path branches on rounds > 10 and > 12 */
+    int rounds;
+};
+typedef struct aes_key_st AES_KEY;
+
+/*
+ * Key-schedule setup from a raw user key of `bits` bits.
+ * NOTE(review): return-value convention is not visible from this header;
+ * confirm against aes.c before relying on it.
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+       AES_KEY *key);
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+       AES_KEY *key);
+
+/* Single-block primitives: transform one block from `in` into `out`. */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+       const AES_KEY *key);
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+       const AES_KEY *key);
+/*
+ * CBC mode over `length` bytes; `ivec` is updated in place and `enc`
+ * selects encryption (non-zero) or decryption (zero).
+ */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                    const unsigned long length, const AES_KEY *key,
+                    unsigned char *ivec, const int enc);
+
+#endif
diff -r f210a633571c -r 1c627434605e tools/blktap2/drivers/atomicio.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/drivers/atomicio.c  Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved.
+ * Copyright (c) 1995,1999 Theo de Raadt.  All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include "atomicio.h"
+
+/*
+ * ensure all of data on socket comes through. f==read || f==vwrite
+ *
+ * Calls f(fd, buf, remaining) repeatedly until n bytes have been
+ * transferred.
+ *
+ * Returns:
+ *   n    all bytes were transferred;
+ *   <n   f returned 0 (end of stream) after that many bytes; errno is
+ *        set to EPIPE;
+ *   0    f failed with an errno other than EINTR/EAGAIN (errno is left
+ *        as f set it), regardless of how much was transferred before.
+ *
+ * EINTR and EAGAIN are retried immediately, without waiting.
+ */
+size_t
+atomicio(ssize_t (*f) (int, void *, size_t), int fd, void *_s, size_t n)
+{
+       char *s = _s;
+       size_t pos = 0;
+       ssize_t res;
+
+       while (n > pos) {
+               res = (f) (fd, s + pos, n - pos);
+               switch (res) {
+               case -1:
+                       if (errno == EINTR || errno == EAGAIN)
+                               continue;
+                       return 0;
+               case 0:
+                       errno = EPIPE;
+                       return pos;
+               default:
+                       pos += (size_t)res;
+                       break;
+               }
+       }
+       return (pos);
+}
+
+
diff -r f210a633571c -r 1c627434605e tools/blktap2/drivers/blk.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/drivers/blk.h       Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,30 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+int blk_getimagesize(int fd, uint64_t *size);
+int blk_getsectorsize(int fd, uint64_t *sector_size);
diff -r f210a633571c -r 1c627434605e tools/blktap2/drivers/blk_linux.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/drivers/blk_linux.c Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,43 @@
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include "tapdisk.h"
+#include "blk.h"
+
+/*
+ * Store the BLKGETSIZE result for the block device open on fd in *size.
+ *
+ * BLKGETSIZE writes an unsigned long, so the ioctl targets a local of
+ * exactly that type and the value is widened afterwards; handing it the
+ * uint64_t directly only works by accident on little-endian or LP64
+ * hosts.
+ *
+ * Returns 0 on success; on failure *size is 0 and -EINVAL is returned.
+ */
+int blk_getimagesize(int fd, uint64_t *size)
+{
+       unsigned long sectors = 0;
+
+       *size = 0;
+       if (ioctl(fd, BLKGETSIZE, &sectors)) {
+               DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+               return -EINVAL;
+       }
+
+       *size = sectors;
+       return 0;
+}
+
+/*
+ * Store the sector size of the block device open on fd in *sector_size.
+ *
+ * BLKSSZGET writes an int, so the ioctl targets a local int and the
+ * result is widened afterwards rather than writing through the
+ * uint64_t pointer.  When the ioctl fails, or BLKSSZGET is not defined
+ * at build time, DEFAULT_SECTOR_SIZE is reported instead.
+ *
+ * Always returns 0.
+ */
+int blk_getsectorsize(int fd, uint64_t *sector_size)
+{
+#if defined(BLKSSZGET)
+       int ssz = 0;
+
+       if (ioctl(fd, BLKSSZGET, &ssz)) {
+               DPRINTF("ERR: BLKSSZGET failed. Falling back to use default "
+                       "sector size");
+               *sector_size = DEFAULT_SECTOR_SIZE;
+       } else {
+               *sector_size = (uint64_t)ssz;
+       }
+
+       if (*sector_size != DEFAULT_SECTOR_SIZE)
+               DPRINTF("Note: sector size is %"PRIu64" (not %u)\n",
+                       *sector_size, DEFAULT_SECTOR_SIZE);
+#else
+       *sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+       return 0;
+}
+
+
diff -r f210a633571c -r 1c627434605e tools/blktap2/drivers/blktap2.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/drivers/blktap2.h   Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _BLKTAP_2_H_
+#define _BLKTAP_2_H_
+
+/* NOTE(review): presumably the major number of the misc char device
+ * class hosting the control node -- confirm against the kernel side. */
+#define MISC_MAJOR_NUMBER              10
+
+/* Size in bytes of the name buffer in struct blktap2_params. */
+#define BLKTAP2_MAX_MESSAGE_LEN        256
+
+/* Ring message codes. */
+#define BLKTAP2_RING_MESSAGE_PAUSE     1
+#define BLKTAP2_RING_MESSAGE_RESUME    2
+#define BLKTAP2_RING_MESSAGE_CLOSE     3
+
+/* ioctl request numbers.
+ * NOTE(review): which device node accepts each request is not visible
+ * here -- confirm against the kernel driver. */
+#define BLKTAP2_IOCTL_KICK_FE          1
+#define BLKTAP2_IOCTL_ALLOC_TAP        200
+#define BLKTAP2_IOCTL_FREE_TAP         201
+#define BLKTAP2_IOCTL_CREATE_DEVICE    202
+#define BLKTAP2_IOCTL_SET_PARAMS       203
+#define BLKTAP2_IOCTL_PAUSE            204
+#define BLKTAP2_IOCTL_REOPEN           205
+#define BLKTAP2_IOCTL_RESUME           206
+
+/* Device-node paths; the last three are formed by string-literal
+ * concatenation onto BLKTAP2_DIRECTORY. */
+#define BLKTAP2_CONTROL_NAME           "blktap-control"
+#define BLKTAP2_DIRECTORY              "/dev/xen/blktap-2"
+#define BLKTAP2_CONTROL_DEVICE         BLKTAP2_DIRECTORY"/control"
+#define BLKTAP2_RING_DEVICE            BLKTAP2_DIRECTORY"/blktap"
+#define BLKTAP2_IO_DEVICE              BLKTAP2_DIRECTORY"/tapdev"
+
+/* Identifiers describing one tap instance.
+ * NOTE(review): presumably ring/device are major numbers and minor is
+ * shared by both nodes -- confirm against the allocating ioctl. */
+struct blktap2_handle {
+       unsigned int                   ring;
+       unsigned int                   device;
+       unsigned int                   minor;
+};
+
+/* Parameters describing a tap device.
+ * NOTE(review): the unit of capacity (sectors vs bytes) is not visible
+ * here -- confirm against the consumer. */
+struct blktap2_params {
+       char                           name[BLKTAP2_MAX_MESSAGE_LEN];
+       unsigned long long             capacity;
+       unsigned long                  sector_size;
+};
+
+#endif
diff -r f210a633571c -r 1c627434605e tools/blktap2/drivers/block-aio.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/drivers/block-aio.c Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,272 @@
+/* 
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <errno.h>
+#include <libaio.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/* Number of request slots preallocated per open image. */
+#define MAX_AIO_REQS         TAPDISK_DATA_REQUESTS
+
+struct tdaio_state;
+
+/* One in-flight request: the caller's td_request_t, the tiocb that is
+ * queued for it, and a back-pointer used by the completion callback to
+ * return the slot to its owner. */
+struct aio_request {
+       td_request_t         treq;
+       struct tiocb         tiocb;
+       struct tdaio_state  *state;
+};
+
+/* Per-image private data of the aio driver. */
+struct tdaio_state {
+       /* descriptor of the opened image */
+       int                  fd;
+       td_driver_t         *driver;
+
+       /*
+        * aio_free_list is used as a stack of unused slots from
+        * aio_requests: aio_free_count entries are valid, the queue
+        * functions pop from the top and tdaio_complete pushes back.
+        */
+       int                  aio_free_count;    
+       struct aio_request   aio_requests[MAX_AIO_REQS];
+       struct aio_request  *aio_free_list[MAX_AIO_REQS];
+};
+
+/*
+ * Get image size and sector size for the image open on fd.
+ *
+ * Block devices are queried with BLKGETSIZE and, when available,
+ * BLKSSZGET; anything else is sized from fstat() as st_size shifted
+ * right by SECTOR_SHIFT, with DEFAULT_SECTOR_SIZE as sector size.
+ *
+ * The ioctls write an unsigned long and an int respectively, so they
+ * target locals of exactly those types and the results are assigned to
+ * the info fields afterwards, independent of how wide those fields are.
+ *
+ * Returns 0 on success, -EINVAL if fstat() or BLKGETSIZE fails.
+ */
+static int tdaio_get_image_info(int fd, td_disk_info_t *info)
+{
+       struct stat st;
+
+       if (fstat(fd, &st) != 0) {
+               DPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (S_ISBLK(st.st_mode)) {
+               unsigned long sectors = 0;
+
+               /*Accessing block device directly*/
+               info->size = 0;
+               if (ioctl(fd, BLKGETSIZE, &sectors) != 0) {
+                       DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+                       return -EINVAL;
+               }
+               info->size = sectors;
+
+               /* NOTE(review): the "pre" label is attached to the shifted
+                * value and "post" to the raw one -- confirm the intent. */
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+
+               /*Get the sector size*/
+#if defined(BLKSSZGET)
+               {
+                       int ssz = 0;
+
+                       /* keep the default if the ioctl fails */
+                       info->sector_size = DEFAULT_SECTOR_SIZE;
+                       if (ioctl(fd, BLKSSZGET, &ssz) == 0)
+                               info->sector_size = ssz;
+
+                       if (info->sector_size != DEFAULT_SECTOR_SIZE)
+                               DPRINTF("Note: sector size is %ld (not %d)\n",
+                                       info->sector_size, DEFAULT_SECTOR_SIZE);
+               }
+#else
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+       } else {
+               /*Local file? try fstat instead*/
+               info->size = (st.st_size >> SECTOR_SHIFT);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+       }
+
+       if (info->size == 0) {
+               /* NOTE(review): hard-coded fallback size for images that
+                * report zero sectors; the origin of this constant is not
+                * visible here. */
+               info->size =((uint64_t) 16836057);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+       }
+       info->info = 0;
+
+       return 0;
+}
+
+/*
+ * Open the disk file and initialize aio state.
+ *
+ * driver : driver instance; driver->data is the tdaio_state to set up
+ *          and driver->info receives the image geometry.
+ * name   : path of the image to open.
+ * flags  : TD_OPEN_RDONLY selects O_RDONLY, otherwise O_RDWR.
+ *
+ * The image is opened with O_DIRECT first; if that fails with EINVAL
+ * the open is retried without it.
+ *
+ * Returns 0 on success, -errno if the image cannot be opened, or the
+ * error from tdaio_get_image_info() (the descriptor is closed then).
+ */
+int tdaio_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       int i, fd, ret, o_flags;
+       struct tdaio_state *prv;
+
+       ret = 0;
+       prv = (struct tdaio_state *)driver->data;
+
+       DPRINTF("block-aio open('%s')", name);
+
+       memset(prv, 0, sizeof(struct tdaio_state));
+
+       /* every request slot starts out on the free stack */
+       prv->aio_free_count = MAX_AIO_REQS;
+       for (i = 0; i < MAX_AIO_REQS; i++)
+               prv->aio_free_list[i] = &prv->aio_requests[i];
+
+       /* Open the file */
+       o_flags = O_DIRECT | O_LARGEFILE |
+               ((flags & TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+       fd = open(name, o_flags);
+
+       if ((fd == -1) && (errno == EINVAL)) {
+               /* Maybe O_DIRECT isn't supported. */
+               o_flags &= ~O_DIRECT;
+               fd = open(name, o_flags);
+               if (fd != -1)
+                       DPRINTF("WARNING: Accessing image without "
+                               "O_DIRECT! (%s)\n", name);
+       } else if (fd != -1) {
+               DPRINTF("open(%s) with O_DIRECT\n", name);
+       }
+
+       if (fd == -1) {
+               /* latch errno before logging can overwrite it */
+               ret = -errno;
+               DPRINTF("Unable to open [%s] (%d)!\n", name, ret);
+               goto done;
+       }
+
+       ret = tdaio_get_image_info(fd, &driver->info);
+       if (ret) {
+               close(fd);
+               goto done;
+       }
+
+       prv->fd = fd;
+
+done:
+       return ret;
+}
+
+/*
+ * Completion callback installed by the queue functions: forwards err
+ * to the original request, then pushes the slot back on the free stack.
+ * arg is the struct aio_request that was handed to td_prep_*().
+ */
+void tdaio_complete(void *arg, struct tiocb *tiocb, int err)
+{
+       struct aio_request *req   = (struct aio_request *)arg;
+       struct tdaio_state *state = req->state;
+
+       td_complete_request(req->treq, err);
+
+       state->aio_free_list[state->aio_free_count] = req;
+       state->aio_free_count++;
+}
+
+/*
+ * Queue an asynchronous read for treq.  The byte count and offset are
+ * derived from treq.secs / treq.sec and the image sector size.  When
+ * no request slot is free the request is completed at once with
+ * -EBUSY.
+ */
+void tdaio_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       struct tdaio_state *state = (struct tdaio_state *)driver->data;
+       int nbytes                = treq.secs * driver->info.sector_size;
+       uint64_t pos              = treq.sec  * (uint64_t)driver->info.sector_size;
+       struct aio_request *req;
+
+       if (state->aio_free_count == 0) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /* pop a slot off the free stack and bind it to this request */
+       req        = state->aio_free_list[--state->aio_free_count];
+       req->treq  = treq;
+       req->state = state;
+
+       td_prep_read(&req->tiocb, state->fd, treq.buf,
+                    nbytes, pos, tdaio_complete, req);
+       td_queue_tiocb(driver, &req->tiocb);
+}
+
+/*
+ * Queue an asynchronous write for treq.  The byte count and offset are
+ * derived from treq.secs / treq.sec and the image sector size.  When
+ * no request slot is free the request is completed at once with
+ * -EBUSY.
+ */
+void tdaio_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       struct tdaio_state *state = (struct tdaio_state *)driver->data;
+       int nbytes                = treq.secs * driver->info.sector_size;
+       uint64_t pos              = treq.sec  * (uint64_t)driver->info.sector_size;
+       struct aio_request *req;
+
+       if (state->aio_free_count == 0) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /* pop a slot off the free stack and bind it to this request */
+       req        = state->aio_free_list[--state->aio_free_count];
+       req->treq  = treq;
+       req->state = state;
+
+       td_prep_write(&req->tiocb, state->fd, treq.buf,
+                     nbytes, pos, tdaio_complete, req);
+       td_queue_tiocb(driver, &req->tiocb);
+}
+
+/* Close the image descriptor held in the driver's private state.
+ * Always returns 0; the result of close() is not inspected. */
+int tdaio_close(td_driver_t *driver)
+{
+       struct tdaio_state *state;
+
+       state = (struct tdaio_state *)driver->data;
+       close(state->fd);
+
+       return 0;
+}
+
+/* This driver never reports a parent: unconditionally returns
+ * TD_NO_PARENT and leaves *id untouched. */
+int tdaio_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       (void)driver;
+       (void)id;
+
+       return TD_NO_PARENT;
+}
+
+/* Parent validation is rejected unconditionally with -EINVAL; none of
+ * the arguments are examined. */
+int tdaio_validate_parent(td_driver_t *driver,
+                         td_driver_t *pdriver, td_flag_t flags)
+{
+       (void)driver;
+       (void)pdriver;
+       (void)flags;
+
+       return -EINVAL;
+}
+
+/*
+ * Driver descriptor for the aio backend: binds the tdaio_* entry
+ * points defined in this file and declares how much private state
+ * (one struct tdaio_state) each instance needs.  No debug hook is
+ * provided.
+ */
+struct tap_disk tapdisk_aio = {
+       .disk_type          = "tapdisk_aio",
+       .flags              = 0,
+       .private_data_size  = sizeof(struct tdaio_state),
+       .td_open            = tdaio_open,
+       .td_close           = tdaio_close,
+       .td_queue_read      = tdaio_queue_read,
+       .td_queue_write     = tdaio_queue_write,
+       .td_get_parent_id   = tdaio_get_parent_id,
+       .td_validate_parent = tdaio_validate_parent,
+       .td_debug           = NULL,
+};
diff -r f210a633571c -r 1c627434605e tools/blktap2/drivers/block-cache.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/drivers/block-cache.c       Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,787 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#include "tapdisk.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+
+/* DBG() logs at TLOG_DBG only in DEBUG builds; otherwise it expands to
+ * a no-op and its arguments are not evaluated. */
+#ifdef DEBUG
+#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
+#else
+#define DBG(_f, _a...) ((void)0)
+#endif
+
+#define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a)
+
+#define RADIX_TREE_PAGE_SHIFT           12 /* 4K pages */
+#define RADIX_TREE_PAGE_SIZE            (1 << RADIX_TREE_PAGE_SHIFT)
+
+/* RADIX_TREE_NODE_SIZE doubles as the number of links per tree node
+ * and as the byte size of one cached leaf buffer. */
+#define RADIX_TREE_NODE_SHIFT           9 /* 512B nodes */
+#define RADIX_TREE_NODE_SIZE            (1 << RADIX_TREE_NODE_SHIFT)
+#define RADIX_TREE_NODE_MASK            (RADIX_TREE_NODE_SIZE - 1)
+
+/* Leaf buffers that fit in one page (PAGE_SIZE / NODE_SIZE). */
+#define BLOCK_CACHE_NODES_PER_PAGE      (1 << (RADIX_TREE_PAGE_SHIFT - 
RADIX_TREE_NODE_SHIFT))
+
+/* NOTE(review): 10 << 20 is 10MB, although this comment used to say
+ * 100MB -- confirm which limit was intended. */
+#define BLOCK_CACHE_MAX_SIZE            (10 << 20) /* 10MB cache */
+#define BLOCK_CACHE_REQUESTS            (TAPDISK_DATA_REQUESTS << 3)
+/* NOTE(review): presumably seconds -- confirm against the prune code. */
+#define BLOCK_CACHE_PAGE_IDLETIME       60
+
+typedef struct radix_tree               radix_tree_t;
+typedef struct radix_tree_node          radix_tree_node_t;
+typedef struct radix_tree_link          radix_tree_link_t;
+typedef struct radix_tree_leaf          radix_tree_leaf_t;
+typedef struct radix_tree_page          radix_tree_page_t;
+
+typedef struct block_cache              block_cache_t;
+typedef struct block_cache_request      block_cache_request_t;
+typedef struct block_cache_stats        block_cache_stats_t;
+
+/*
+ * One cached buffer shared by up to BLOCK_CACHE_NODES_PER_PAGE leaves.
+ * buf is freed together with the page; size is its length in bytes and
+ * sec the first sector it covers.  owners[] records the links pointing
+ * into buf so they can be cleared when the page is removed.
+ */
+struct radix_tree_page {
+       char                           *buf;
+       size_t                          size;
+       uint64_t                        sec;
+       radix_tree_link_t              *owners[BLOCK_CACHE_NODES_PER_PAGE];
+};
+
+/* Leaf payload: the owning page and this leaf's position inside the
+ * page buffer (page->buf plus an offset). */
+struct radix_tree_leaf {
+       radix_tree_page_t              *page;
+       char                           *buf;
+};
+
+/*
+ * One slot of a tree node: either a pointer to a child node or an
+ * embedded leaf.  Nodes of height 0 are the ones holding leaves.
+ * NOTE(review): time is presumably a last-access stamp used for idle
+ * pruning -- confirm against the lookup and gc code.
+ */
+struct radix_tree_link {
+       uint32_t                        time;
+       union {
+               radix_tree_node_t      *next;
+               radix_tree_leaf_t       leaf;
+       } u;
+};
+
+/* Interior or leaf-level node; height 0 means its links are leaves. */
+struct radix_tree_node {
+       int                             height;
+       radix_tree_link_t               links[RADIX_TREE_NODE_SIZE];
+};
+
+/*
+ * Tree bookkeeping: height is the height of the root node, size the
+ * sum of the sizes of all live pages, nodes the count of allocated
+ * nodes, and cache a back-pointer to the owning block cache.
+ */
+struct radix_tree {
+       int                             height;
+       uint64_t                        size;
+       uint32_t                        nodes;
+       radix_tree_node_t              *root;
+
+       block_cache_t                  *cache;
+};
+
+/*
+ * One outstanding cache request.
+ * NOTE(review): field roles below are read from names only (err = first
+ * error seen, secs = sectors still pending) -- confirm against the
+ * request completion path.
+ */
+struct block_cache_request {
+       int                             err;
+       char                           *buf;
+       uint64_t                        secs;
+       td_request_t                    treq;
+       block_cache_t                  *cache;
+};
+
+/* Counters; prunes is advanced by the number of sectors of every page
+ * that gets freed. */
+struct block_cache_stats {
+       uint64_t                        reads;
+       uint64_t                        hits;
+       uint64_t                        misses;
+       uint64_t                        prunes;
+};
+
+/*
+ * Per-image cache state: a fixed pool of BLOCK_CACHE_REQUESTS request
+ * slots with a matching free list, the radix tree holding the cached
+ * data, and statistics.
+ * NOTE(review): ptype/name/sectors presumably describe the parent image
+ * being cached -- confirm against the open path.
+ */
+struct block_cache {
+       int                             ptype;
+       char                           *name;
+
+       uint64_t                        sectors;
+
+       block_cache_request_t           requests[BLOCK_CACHE_REQUESTS];
+       block_cache_request_t          *request_free_list[BLOCK_CACHE_REQUESTS];
+       int                             requests_free;
+
+       event_id_t                      timeout_id;
+
+       radix_tree_t                    tree;
+
+       block_cache_stats_t             stats;
+};
+
+/*
+ * Number of sectors addressable by a tree whose root has the given
+ * height: RADIX_TREE_NODE_SIZE << (height * RADIX_TREE_NODE_SHIFT).
+ *
+ * The result saturates at the largest uint64_t once the shift would no
+ * longer fit in 64 bits.  Without that, the shift count reaches the
+ * type width (undefined behaviour) and smaller overflows wrap to 0,
+ * which would keep radix_tree_calculate_height() looping forever.
+ */
+static inline uint64_t
+radix_tree_calculate_size(int height)
+{
+       const int shift = height * RADIX_TREE_NODE_SHIFT;
+
+       if (shift + RADIX_TREE_NODE_SHIFT >= 64)
+               return ~(uint64_t)0;
+
+       return (uint64_t)RADIX_TREE_NODE_SIZE << shift;
+}
+
+/*
+ * Smallest root height (at least 1) whose tree, as sized by
+ * radix_tree_calculate_size(), covers the given number of sectors.
+ */
+static inline int
+radix_tree_calculate_height(uint64_t sectors)
+{
+       int levels;
+
+       /* always allocate root node, hence the start at 1 */
+       for (levels = 1;
+            sectors > radix_tree_calculate_size(levels);
+            levels++)
+               ;
+
+       return levels;
+}
+
+/* Index into node->links for the given sector: the
+ * RADIX_TREE_NODE_SHIFT-bit digit of the sector number selected by the
+ * node's height. */
+static inline int
+radix_tree_index(radix_tree_node_t *node, uint64_t sector)
+{
+       const int shift = node->height * RADIX_TREE_NODE_SHIFT;
+
+       return (sector >> shift) & RADIX_TREE_NODE_MASK;
+}
+
+/* Non-zero iff the node sits at height 0, i.e. its links are leaves.
+ * The tree argument is not consulted. */
+static inline int
+radix_tree_node_contains_leaves(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       return !node->height;
+}
+
+/* Non-zero iff the node's height equals the height recorded for the
+ * tree, which identifies the root level. */
+static inline int
+radix_tree_node_is_root(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       return (tree->height == node->height);
+}
+
+/* Total footprint of the tree: the accumulated page sizes plus the
+ * memory taken by all allocated nodes. */
+static inline uint64_t
+radix_tree_size(radix_tree_t *tree)
+{
+       uint64_t total = tree->size;
+
+       total += tree->nodes * sizeof(radix_tree_node_t);
+
+       return total;
+}
+
+/* Zero a link in place; a NULL link is ignored. */
+static inline void
+radix_tree_clear_link(radix_tree_link_t *link)
+{
+       if (link == NULL)
+               return;
+
+       memset(link, 0, sizeof(*link));
+}
+
+/* Allocate a zeroed node of the given height and count it in
+ * tree->nodes.  Returns NULL, leaving the count alone, when the
+ * allocation fails. */
+static inline radix_tree_node_t *
+radix_tree_allocate_node(radix_tree_t *tree, int height)
+{
+       radix_tree_node_t *fresh = calloc(1, sizeof(*fresh));
+
+       if (fresh != NULL) {
+               fresh->height = height;
+               tree->nodes++;
+       }
+
+       return fresh;
+}
+
+/* Allocate a node one level below parent; NULL on allocation
+ * failure. */
+static inline radix_tree_node_t *
+radix_tree_allocate_child_node(radix_tree_t *tree, radix_tree_node_t *parent)
+{
+       const int child_height = parent->height - 1;
+
+       return radix_tree_allocate_node(tree, child_height);
+}
+
+/* Release a node and drop it from tree->nodes; a NULL node is a
+ * no-op.  Links inside the node are not followed. */
+void
+radix_tree_free_node(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       if (node != NULL) {
+               free(node);
+               tree->nodes--;
+       }
+}
+
+/*
+ * Wrap buf (size bytes, starting at sector sec) in a new page record
+ * and add its size to tree->size.  The owners array starts out empty.
+ * Returns NULL, leaving the tree untouched, when the allocation fails.
+ */
+static inline radix_tree_page_t *
+radix_tree_allocate_page(radix_tree_t *tree,
+                        char *buf, uint64_t sec, size_t size)
+{
+       radix_tree_page_t *fresh = calloc(1, sizeof(*fresh));
+
+       if (fresh == NULL)
+               return NULL;
+
+       fresh->buf  = buf;
+       fresh->sec  = sec;
+       fresh->size = size;
+
+       tree->size += size;
+
+       return fresh;
+}
+
+/*
+ * release @page together with the data buffer it points at.
+ * logs one line per RADIX_TREE_NODE_SHIFT-sized unit held by the page,
+ * adds that count to the cache's prune statistics and subtracts the page
+ * size from tree->size.
+ */
+static inline void
+radix_tree_free_page(radix_tree_t *tree, radix_tree_page_t *page)
+{
+       int i;
+
+       for (i = 0; i < page->size >> RADIX_TREE_NODE_SHIFT; i++)
+               DBG("%s: ejecting sector 0x%llx\n",
+                   tree->cache->name, page->sec + i);
+
+       tree->cache->stats.prunes += (page->size >> RADIX_TREE_NODE_SHIFT);
+       tree->size -= page->size;
+       /* the page owns its buffer: both are freed here */
+       free(page->buf);
+       free(page);
+}
+
+/*
+ * drop @page from the tree: every link recorded in page->owners is
+ * cleared, then the page (and its buffer) is freed.
+ * only leaves go away here; interior nodes are left for gc to reap later.
+ * a NULL @page is ignored.
+ */
+static void
+radix_tree_remove_page(radix_tree_t *tree, radix_tree_page_t *page)
+{
+       int slot;
+
+       if (page) {
+               for (slot = 0; slot < BLOCK_CACHE_NODES_PER_PAGE; slot++)
+                       radix_tree_clear_link(page->owners[slot]);
+
+               radix_tree_free_page(tree, page);
+       }
+}
+
+/*
+ * attach @link to @page as a leaf whose data starts @off bytes into
+ * page->buf, recording @link in the first unused page->owners slot.
+ * nothing is changed if @off does not leave room for one full node-sized
+ * chunk inside the page, or if every owner slot is already taken.
+ */
+static void
+radix_tree_insert_leaf(radix_tree_t *tree, radix_tree_link_t *link,
+                      radix_tree_page_t *page, off_t off)
+{
+       int slot;
+
+       if (off + RADIX_TREE_NODE_SIZE > page->size)
+               return;
+
+       /* find the first free owner slot */
+       for (slot = 0; slot < BLOCK_CACHE_NODES_PER_PAGE; slot++)
+               if (!page->owners[slot])
+                       break;
+
+       if (!(slot < BLOCK_CACHE_NODES_PER_PAGE))
+               return;
+
+       page->owners[slot] = link;
+       link->u.leaf.page  = page;
+       link->u.leaf.buf   = page->buf + off;
+}
+
+/*
+ * look up the cached buffer for @sector.
+ * every link visited on the way down is stamped with the current time.
+ * returns the buffer stored in the leaf slot (NULL if the slot is empty),
+ * or NULL as soon as an interior link on the path is missing.
+ */
+static char *
+radix_tree_find_leaf(radix_tree_t *tree, uint64_t sector)
+{
+       struct timeval tv;
+       radix_tree_node_t *cur;
+       radix_tree_link_t *slot;
+
+       gettimeofday(&tv, NULL);
+
+       for (cur = tree->root; ; cur = slot->u.next) {
+               slot       = cur->links + radix_tree_index(cur, sector);
+               slot->time = tv.tv_sec;
+
+               if (radix_tree_node_contains_leaves(tree, cur))
+                       return slot->u.leaf.buf;
+
+               if (!slot->u.next)
+                       return NULL;
+       }
+}
+
+/*
+ * make the leaf slot for @sector refer to @page at byte offset @off.
+ * walks down from the root, stamping each visited link with the current
+ * time and allocating interior nodes that do not exist yet.
+ * returns the leaf buffer pointer, or NULL if a node allocation failed or
+ * radix_tree_insert_leaf() attached nothing.
+ */
+static char *
+radix_tree_add_leaf(radix_tree_t *tree, uint64_t sector,
+                   radix_tree_page_t *page, off_t off)
+{
+       int idx;
+       struct timeval now;
+       radix_tree_link_t *link;
+       radix_tree_node_t *node;
+
+       node = tree->root;
+       gettimeofday(&now, NULL);
+
+       do {
+               idx        = radix_tree_index(node, sector);
+               link       = node->links + idx;
+               link->time = now.tv_sec;
+
+               if (radix_tree_node_contains_leaves(tree, node)) {
+                       /* evict whatever page occupied this slot; this also
+                        * clears every other link owned by that page */
+                       radix_tree_remove_page(tree, link->u.leaf.page);
+                       radix_tree_insert_leaf(tree, link, page, off);
+                       return link->u.leaf.buf;
+               }
+
+               if (!link->u.next) {
+                       link->u.next = radix_tree_allocate_child_node(tree,
+                                                                     node);
+                       if (!link->u.next)
+                               return NULL;
+               }
+
+               node = link->u.next;
+       } while (1);
+}
+
+/*
+ * insert @sectors consecutive sectors starting at @sector, all backed by
+ * the single buffer @buf, which is wrapped in one shared page.
+ * returns 0 on success, after which the buffer is freed whenever the
+ * page is removed from the tree.
+ * returns -ENOMEM on failure; in that case @buf is NOT freed here and
+ * remains the caller's responsibility.
+ */
+static int
+radix_tree_add_leaves(radix_tree_t *tree, char *buf,
+                     uint64_t sector, uint64_t sectors)
+{
+       int i;
+       radix_tree_page_t *page;
+
+       page = radix_tree_allocate_page(tree, buf, sector,
+                                       sectors << RADIX_TREE_NODE_SHIFT);
+       if (!page)
+               return -ENOMEM;
+
+       for (i = 0; i < sectors; i++)
+               if (!radix_tree_add_leaf(tree, sector + i, 
+                                        page, (i << RADIX_TREE_NODE_SHIFT)))
+                       goto fail;
+
+       return 0;
+
+fail:
+       /* detach the buffer first so radix_tree_free_page() does not free it */
+       page->buf = NULL;
+       radix_tree_remove_page(tree, page);
+       return -ENOMEM;
+}
+
+/*
+ * recursively destroy the subtree rooted at @node: pages referenced from
+ * leaf-level links are removed, child nodes are deleted depth-first, and
+ * finally @node itself is freed.  a NULL @node is ignored.
+ */
+static void
+radix_tree_delete_branch(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       int i;
+       radix_tree_link_t *link;
+
+       if (!node)
+               return;
+
+       for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) {
+               link = node->links + i;
+
+               if (radix_tree_node_contains_leaves(tree, node))
+                       radix_tree_remove_page(tree, link->u.leaf.page);
+               else
+                       radix_tree_delete_branch(tree, link->u.next);
+
+               radix_tree_clear_link(link);
+       }
+
+       radix_tree_free_node(tree, node);
+}
+
+/* delete every node and page in @tree, including the root, and reset
+ * tree->root to NULL */
+static inline void
+radix_tree_destroy(radix_tree_t *tree)
+{
+       radix_tree_delete_branch(tree, tree->root);
+       tree->root = NULL;
+}
+
+/*
+ * drop everything under @node whose link has not been stamped within the
+ * last BLOCK_CACHE_PAGE_IDLETIME seconds of @now.
+ * recently used interior links are descended into; idle links have their
+ * page (leaf level) or whole subtree (interior level) deleted.
+ * a node left empty is freed unless it is the root.
+ *
+ * returns 1 if @node is empty after pruning, 0 otherwise
+ */
+static int
+radix_tree_prune_branch(radix_tree_t *tree,
+                       radix_tree_node_t *node, uint32_t now)
+{
+       int i, empty;
+       radix_tree_link_t *link;
+
+       empty = 1;
+       if (!node)
+               return empty;
+
+       for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) {
+               link = node->links + i;
+
+               /* link was touched recently: keep it (leaf) or recurse */
+               if (now - link->time < BLOCK_CACHE_PAGE_IDLETIME) {
+                       if (radix_tree_node_contains_leaves(tree, node)) {
+                               empty = 0;
+                               continue;
+                       }
+
+                       /* the recursive call frees an emptied child itself;
+                        * only the link to it needs clearing here */
+                       if (radix_tree_prune_branch(tree, link->u.next, now))
+                               radix_tree_clear_link(link);
+                       else
+                               empty = 0;
+
+                       continue;
+               }
+
+               /* idle link: discard whatever hangs off it */
+               if (radix_tree_node_contains_leaves(tree, node))
+                       radix_tree_remove_page(tree, link->u.leaf.page);
+               else
+                       radix_tree_delete_branch(tree, link->u.next);
+
+               radix_tree_clear_link(link);
+       }
+
+       if (empty && !radix_tree_node_is_root(tree, node))
+               radix_tree_free_node(tree, node);
+
+       return empty;
+}
+
+/*
+ * walk tree and free any node that has been idle for too long.
+ * the tree's byte count is logged before and after the pass; a tree
+ * without a root is left alone.
+ */
+static void
+radix_tree_prune(radix_tree_t *tree)
+{
+       struct timeval now;
+
+       if (!tree->root)
+               return;
+
+       DPRINTF("tree %s has %"PRIu64" bytes\n",
+               tree->cache->name, tree->size);
+
+       gettimeofday(&now, NULL);
+       radix_tree_prune_branch(tree, tree->root, now.tv_sec);
+
+       DPRINTF("tree %s now has %"PRIu64" bytes\n",
+               tree->cache->name, tree->size);
+}
+
+/*
+ * size @tree for @sectors sectors and allocate its root node.
+ * returns 0 on success, -ENOMEM if the root cannot be allocated.
+ */
+static inline int
+radix_tree_initialize(radix_tree_t *tree, uint64_t sectors)
+{
+       tree->height = radix_tree_calculate_height(sectors);
+       tree->root   = radix_tree_allocate_node(tree, tree->height);
+
+       return tree->root ? 0 : -ENOMEM;
+}
+
+/* release all memory held by @tree; thin wrapper around
+ * radix_tree_destroy() */
+static inline void
+radix_tree_free(radix_tree_t *tree)
+{
+       radix_tree_destroy(tree);
+}
+
+/*
+ * event callback: prune the radix tree of the cache passed in @private.
+ * @id and @mode are not used.
+ */
+static void
+block_cache_prune_event(event_id_t id, char mode, void *private)
+{
+       block_cache_t *cache = (block_cache_t *)private;
+
+       radix_tree_prune(&cache->tree);
+}
+
+/* pop a request descriptor off the free list; NULL when none are left */
+static inline block_cache_request_t *
+block_cache_get_request(block_cache_t *cache)
+{
+       if (cache->requests_free) {
+               cache->requests_free--;
+               return cache->request_free_list[cache->requests_free];
+       }
+
+       return NULL;
+}
+
+/* zero @breq and push it back onto the cache's free list */
+static inline void
+block_cache_put_request(block_cache_t *cache, block_cache_request_t *breq)
+{
+       memset(breq, 0, sizeof(*breq));
+
+       cache->request_free_list[cache->requests_free] = breq;
+       cache->requests_free++;
+}
+
+/*
+ * open a block cache on top of @driver.
+ * only read-only opens of images whose sector size equals
+ * RADIX_TREE_NODE_SIZE are accepted (-EINVAL otherwise).
+ * sets up the radix tree, the request free list and a periodic prune
+ * event firing every 2 * BLOCK_CACHE_PAGE_IDLETIME.
+ * returns 0 on success or a negative error code; on failure the name copy
+ * and the tree are released again.
+ */
+static int
+block_cache_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       int i, err;
+       radix_tree_t *tree;
+       block_cache_t *cache;
+
+       if (!td_flag_test(flags, TD_OPEN_RDONLY))
+               return -EINVAL;
+
+       if (driver->info.sector_size != RADIX_TREE_NODE_SIZE)
+               return -EINVAL;
+
+       cache = (block_cache_t *)driver->data;
+       err   = tapdisk_namedup(&cache->name, (char *)name);
+       if (err)
+               return -ENOMEM;
+
+       cache->sectors = driver->info.size;
+
+       tree = &cache->tree;
+       err  = radix_tree_initialize(tree, cache->sectors);
+       if (err)
+               goto fail;
+
+       tree->cache = cache;
+       cache->requests_free = BLOCK_CACHE_REQUESTS;
+       for (i = 0; i < BLOCK_CACHE_REQUESTS; i++)
+               cache->request_free_list[i] = cache->requests + i;
+
+       cache->timeout_id =
+               tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT,
+                                             -1, /* dummy fd */
+                                             BLOCK_CACHE_PAGE_IDLETIME << 1,
+                                             block_cache_prune_event,
+                                             cache);
+       if (cache->timeout_id < 0) {
+               /*
+                * err is still 0 from radix_tree_initialize(); without
+                * this the open would be torn down yet reported as
+                * successful.
+                * NOTE(review): assumes the negative id is usable as an
+                * error code -- confirm against
+                * tapdisk_server_register_event().
+                */
+               err = cache->timeout_id;
+               goto fail;
+       }
+
+       DPRINTF("opening cache for %s, sectors: %"PRIu64", "
+               "tree: %p, height: %d\n",
+               cache->name, cache->sectors, tree, tree->height);
+
+       /* best effort only: a failure is logged, not fatal */
+       if (mlockall(MCL_CURRENT | MCL_FUTURE))
+               DPRINTF("mlockall failed: %d\n", -errno);
+
+       return 0;
+
+fail:
+       free(cache->name);
+       cache->name = NULL;
+       radix_tree_free(&cache->tree);
+       return err;
+}
+
+/*
+ * tear down the cache attached to @driver: cancel the prune event, free
+ * the radix tree and release the name copy.  always returns 0.
+ */
+static int
+block_cache_close(td_driver_t *driver)
+{
+       block_cache_t *cache = (block_cache_t *)driver->data;
+
+       DPRINTF("closing cache for %s\n", cache->name);
+
+       tapdisk_server_unregister_event(cache->timeout_id);
+       radix_tree_free(&cache->tree);
+       free(cache->name);
+
+       return 0;
+}
+
+/*
+ * debugging checksum of one RADIX_TREE_NODE_SIZE-byte buffer.
+ * NOTE: currently disabled -- the unconditional "return 0" below makes the
+ * summation code unreachable, so every call yields 0.
+ */
+static inline uint64_t
+block_cache_hash(block_cache_t *cache, char *buf)
+{
+       int i, n;
+       uint64_t cksm, *data;
+
+       return 0;
+
+       /* unreachable: complement of the sum of the buffer's 64-bit words */
+       cksm = 0;
+       data = (uint64_t *)buf;
+       n    = RADIX_TREE_NODE_SIZE / sizeof(uint64_t);
+
+       for (i = 0; i < n; i++)
+               cksm += data[i];
+
+       return ~cksm;
+}
+
+/*
+ * serve @treq entirely from the cache: iov[i] is the cached buffer for
+ * sector treq.sec + i.  each one is copied into the request buffer and
+ * the request is completed with status 0.
+ */
+static void
+block_cache_hit(block_cache_t *cache, td_request_t treq, char *iov[])
+{
+       int i;
+       off_t off;
+
+       cache->stats.hits += treq.secs;
+
+       for (i = 0; i < treq.secs; i++) {
+               DBG("%s: block cache hit: sec 0x%08llx, hash: 0x%08llx\n",
+                   cache->name, treq.sec + i, block_cache_hash(cache, iov[i]));
+
+               off = i << RADIX_TREE_NODE_SHIFT;
+               memcpy(treq.buf + off, iov[i], RADIX_TREE_NODE_SIZE);
+       }
+
+       td_complete_request(treq, 0);
+}
+
+/*
+ * completion callback for the clone issued by block_cache_miss().
+ * may be called several times per request; work is done only once all
+ * outstanding sectors have been accounted for.  the first error seen is
+ * kept.  on success the data is copied to the original request buffer
+ * and the private buffer is handed to the radix tree; the original
+ * request is then completed and the descriptor recycled.
+ */
+static void
+block_cache_populate_cache(td_request_t clone, int err)
+{
+       int i;
+       radix_tree_t *tree;
+       block_cache_t *cache;
+       block_cache_request_t *breq;
+
+       breq        = (block_cache_request_t *)clone.cb_data;
+       cache       = breq->cache;
+       tree        = &cache->tree;
+       breq->secs -= clone.secs;
+       breq->err   = (breq->err ? breq->err : err);
+
+       /* more partial completions still to come */
+       if (breq->secs)
+               return;
+
+       if (breq->err) {
+               free(breq->buf);
+               goto out;
+       }
+
+       for (i = 0; i < breq->treq.secs; i++) {
+               off_t off = i << RADIX_TREE_NODE_SHIFT;
+               DBG("%s: populating sec 0x%08llx\n",
+                   cache->name, breq->treq.sec + i);
+               memcpy(breq->treq.buf + off,
+                      breq->buf + off, RADIX_TREE_NODE_SIZE);
+       }
+
+       /* on failure the tree did not take the buffer, so free it here */
+       if (radix_tree_add_leaves(tree, breq->buf,
+                                 breq->treq.sec, breq->treq.secs))
+               free(breq->buf);
+
+out:
+       td_complete_request(breq->treq, breq->err);
+       block_cache_put_request(cache, breq);
+}
+
+/*
+ * handle a read that is not fully cached.
+ * normally a clone of @treq is forwarded that reads into a private
+ * aligned buffer and completes via block_cache_populate_cache().
+ * if the cache would exceed BLOCK_CACHE_MAX_SIZE, no request descriptor is
+ * free, or the buffer cannot be allocated, the unmodified request is
+ * forwarded instead and nothing is cached.
+ */
+static void
+block_cache_miss(block_cache_t *cache, td_request_t treq)
+{
+       char *buf;
+       size_t size;
+       td_request_t clone;
+       radix_tree_t *tree;
+       block_cache_request_t *breq;
+
+       DBG("%s: block cache miss: sec 0x%08llx\n", cache->name, treq.sec);
+
+       clone = treq;
+       tree  = &cache->tree;
+       size  = treq.secs << RADIX_TREE_NODE_SHIFT;
+
+       cache->stats.misses += treq.secs;
+
+       if (radix_tree_size(tree) + size >= BLOCK_CACHE_MAX_SIZE)
+               goto out;
+
+       breq = block_cache_get_request(cache);
+       if (!breq)
+               goto out;
+
+       if (posix_memalign((void **)&buf, RADIX_TREE_NODE_SIZE, size)) {
+               block_cache_put_request(cache, breq);
+               goto out;
+       }
+
+       breq->treq    = treq;
+       breq->secs    = treq.secs;
+       breq->err     = 0;
+       breq->buf     = buf;
+       breq->cache   = cache;
+
+       /* redirect the clone into our buffer and hook the completion */
+       clone.buf     = buf;
+       clone.cb      = block_cache_populate_cache;
+       clone.cb_data = breq;
+
+out:
+       td_forward_request(clone);
+}
+
+/*
+ * read entry point.
+ * requests larger than BLOCK_CACHE_NODES_PER_PAGE sectors bypass the
+ * cache.  otherwise every sector is looked up; the request is served from
+ * the cache only if all sectors are present, and treated as a miss as
+ * soon as one is absent.
+ */
+static void
+block_cache_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       int i;
+       radix_tree_t *tree;
+       block_cache_t *cache;
+       char *iov[BLOCK_CACHE_NODES_PER_PAGE];
+
+       cache = (block_cache_t *)driver->data;
+       tree  = &cache->tree;
+
+       cache->stats.reads += treq.secs;
+
+       /* a void function must not "return <expression>" in ISO C, so the
+        * calls and the returns are kept as separate statements */
+       if (treq.secs > BLOCK_CACHE_NODES_PER_PAGE) {
+               td_forward_request(treq);
+               return;
+       }
+
+       for (i = 0; i < treq.secs; i++) {
+               iov[i] = radix_tree_find_leaf(tree, treq.sec + i);
+               if (!iov[i]) {
+                       block_cache_miss(cache, treq);
+                       return;
+               }
+       }
+
+       block_cache_hit(cache, treq, iov);
+}
+
+/* writes are not supported: every request is completed with -EPERM */
+static void
+block_cache_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       td_complete_request(treq, -EPERM);
+}
+
+/* this driver reports no parent id: always returns -EINVAL */
+static int
+block_cache_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       return -EINVAL;
+}
+
+/*
+ * accept @pdriver as parent only if it is in read-only state and carries
+ * the same name as @driver.  returns 0 if acceptable, -EINVAL otherwise.
+ * @flags is not used.
+ */
+static int
+block_cache_validate_parent(td_driver_t *driver,
+                           td_driver_t *pdriver, td_flag_t flags)
+{
+       if (!td_flag_test(pdriver->state, TD_DRIVER_RDONLY))
+               return -EINVAL;
+
+       if (strcmp(driver->name, pdriver->name))
+               return -EINVAL;
+
+       return 0;
+}
+
+/* log the cache name and its read/hit/miss/prune counters */
+static void
+block_cache_debug(td_driver_t *driver)
+{
+       block_cache_t *cache;
+       block_cache_stats_t *stats;
+
+       cache = (block_cache_t *)driver->data;
+       stats = &cache->stats;
+
+       WARN("BLOCK CACHE %s\n", cache->name);
+       WARN("reads: %"PRIu64", hits: %"PRIu64", misses: %"PRIu64", "
+            "prunes: %"PRIu64"\n",
+            stats->reads, stats->hits, stats->misses, stats->prunes);
+}
+
+/* driver descriptor wiring the block_cache_* callbacks into tapdisk */
+struct tap_disk tapdisk_block_cache = {
+       .disk_type                  = "tapdisk_block_cache",
+       .flags                      = 0,
+       .private_data_size          = sizeof(block_cache_t),
+       .td_open                    = block_cache_open,
+       .td_close                   = block_cache_close,
+       .td_queue_read              = block_cache_queue_read,
+       .td_queue_write             = block_cache_queue_write,
+       .td_get_parent_id           = block_cache_get_parent_id,
+       .td_validate_parent         = block_cache_validate_parent,
+       .td_debug                   = block_cache_debug,
+};
diff -r f210a633571c -r 1c627434605e tools/blktap2/drivers/block-log.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/drivers/block-log.c Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,688 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver to sit on top of another disk and log writes, in order
+ * to synchronize two distinct disks
+ *
+ * On receipt of a control request it can export a list of dirty
+ * sectors in the following format:
+ * struct writerange {
+ *   u64 sector;
+ *   u32 count;
+ * }
+ * terminated by { 0, 0 }
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "log.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+#define MAX_CONNECTIONS 1
+
+/* a file descriptor paired with the scheduler event registered for it */
+typedef struct poll_fd {
+  int          fd;
+  event_id_t   id;
+} poll_fd_t;
+
+/* per-device state of the write-logging driver */
+struct tdlog_state {
+  uint64_t     size;       /* disk size in sectors (driver->info.size) */
+
+  void*        writelog;   /* dirty bitmap, one bit per sector */
+
+  char*        ctlpath;    /* path of the control socket */
+  poll_fd_t    ctl;        /* listening control socket */
+
+  int          connected;  /* number of entries used in connections[] */
+  poll_fd_t    connections[MAX_CONNECTIONS];
+
+  char*        shmpath;    /* name of the shared memory object */
+  void*        shm;        /* mapping of that object (SHMSIZE bytes) */
+
+  log_sring_t* sring;      /* shared ring located inside shm */
+  log_back_ring_t bring;   /* back-end view of sring */
+};
+
+#define BDPRINTF(_f, _a...) syslog (LOG_DEBUG, "log: " _f "\n", ## _a)
+
+#define BWPRINTF(_f, _a...) syslog (LOG_WARNING, "log: " _f "\n", ## _a)
+
+static void ctl_accept(event_id_t, char, void *);
+static void ctl_request(event_id_t, char, void *);
+
+/* -- write log -- */
+
+/* large flat bitmaps don't scale particularly well either in size or scan
+ * time, but they'll do for now */
+/* number of bits in one unsigned long bitmap word */
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+/* number of words needed to hold 'bits' bits, rounded up */
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+
+/* lvalue of the word holding bit _nr, and that bit's position within it */
+#define BITMAP_ENTRY(_nr, _bmap) ((unsigned long*)(_bmap))[(_nr)/BITS_PER_LONG]
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+/* return the value (0 or 1) of bit nr in the bitmap at bmap.
+ * nr is 64 bits wide because callers pass 64-bit sector numbers, which
+ * an int parameter would silently truncate. */
+static inline int test_bit(uint64_t nr, void* bmap)
+{
+  return (BITMAP_ENTRY(nr, bmap) >> BITMAP_SHIFT(nr)) & 1;
+}
+
+/* clear bit nr in the bitmap at bmap.
+ * nr is 64 bits wide because callers pass 64-bit sector numbers, which
+ * an int parameter would silently truncate. */
+static inline void clear_bit(uint64_t nr, void* bmap)
+{
+  BITMAP_ENTRY(nr, bmap) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+
+/* set bit nr in the bitmap at bmap.
+ * nr is 64 bits wide because callers pass 64-bit sector numbers, which
+ * an int parameter would silently truncate. */
+static inline void set_bit(uint64_t nr, void* bmap)
+{
+  BITMAP_ENTRY(nr, bmap) |= (1UL << BITMAP_SHIFT(nr));
+}
+
+/* bytes needed for a bitmap of sz bits.
+ * rounded UP to a whole number of unsigned long words: the bit helpers
+ * read and write whole words, so a byte count rounded down (sz >> 3)
+ * leaves the last word partly or wholly outside the allocation.
+ * returned as 64 bits so very large disks are not truncated. */
+static inline uint64_t bitmap_size(uint64_t sz)
+{
+  return (uint64_t)BITS_TO_LONGS(sz) * sizeof(unsigned long);
+}
+
+/* allocate a zeroed dirty bitmap covering s->size sectors.
+ * returns 0 on success, -1 if the allocation fails. */
+static int writelog_create(struct tdlog_state *s)
+{
+  uint64_t nbytes = bitmap_size(s->size);
+
+  BDPRINTF("allocating %"PRIu64" bytes for dirty bitmap", nbytes);
+
+  s->writelog = calloc(nbytes, 1);
+  if (s->writelog)
+    return 0;
+
+  BWPRINTF("could not allocate dirty bitmap of size %"PRIu64, nbytes);
+  return -1;
+}
+
+/* release the dirty bitmap.  safe to call when none was allocated and
+ * safe to call more than once; always returns 0. */
+static int writelog_free(struct tdlog_state *s)
+{
+  /* free(NULL) is a no-op, so no guard is needed */
+  free(s->writelog);
+  s->writelog = NULL;
+
+  return 0;
+}
+
+/* mark count sectors starting at sector as dirty; always returns 0.
+ * NOTE(review): the range is not checked against s->size here --
+ * presumably callers only pass requests that lie within the disk. */
+static int writelog_set(struct tdlog_state* s, uint64_t sector, int count)
+{
+  int i;
+
+  for (i = 0; i < count; i++) 
+    set_bit(sector + i, s->writelog);
+
+  return 0;
+}
+
+/* clear the dirty bits for sectors [start, end).
+ * if end is 0 (or lies beyond the disk), clear to end of disk.
+ * always returns 0. */
+int writelog_clear(struct tdlog_state* s, uint64_t start, uint64_t end)
+{
+  if (!end || end > s->size)
+    end = s->size;
+
+  /* clear bit by bit up to the first word boundary, never past end */
+  while (start < end && BITMAP_SHIFT(start))
+    clear_bit(start++, s->writelog);
+  /* end is exclusive: step back first, then clear, until word aligned */
+  while (end > start && BITMAP_SHIFT(end))
+    clear_bit(--end, s->writelog);
+
+  /* whatever remains is a run of whole words; offset and length in bytes */
+  if (end > start)
+    memset((char*)s->writelog + (start >> 3), 0, (end - start) >> 3);
+
+  return 0;
+}
+
+/* write the dirty bitmap into the shared memory region as a list of
+ * { sector, count } extents, one per run of consecutive dirty bits.
+ * returns last block exported (may not be end of disk if shm region
+ * overflows).
+ * NOTE: when the region overflows the function returns early and the
+ * { 0, 0 } terminator is not written. */
+static uint64_t writelog_export(struct tdlog_state* s)
+{
+  struct disk_range* range = s->shm;
+  uint64_t i = 0;
+
+  BDPRINTF("sector count: %"PRIu64, s->size);
+
+  for (i = 0; i < s->size; i++) {
+    if (test_bit(i, s->writelog)) {
+      /* range start */
+      range->sector = i;
+      range->count = 1;
+      /* find end */
+      for (i++; i < s->size && test_bit(i, s->writelog); i++)
+       range->count++;
+
+      BDPRINTF("export: dirty extent %"PRIu64":%u",
+              range->sector, range->count);
+      range++;
+
+      /* out of space in shared memory region */
+      if ((void*)range >= bmend(s->shm)) {
+       BDPRINTF("out of space in shm region at sector %"PRIu64, i);
+       return i;
+      }
+
+      /* undo forloop increment */
+      i--;
+    }
+  }
+
+  /* NULL-terminate range list */
+  range->sector = 0;
+  range->count = 0;
+
+  return i;
+}
+
+/* -- communication channel -- */
+
+/* replace ':' and '/' with '_' in path, looking at no more than len
+ * bytes and stopping early at the terminating NUL */
+static inline void path_escape(char* path, size_t len) {
+  char* p;
+  char* end = path + len;
+
+  for (p = path; p < end && *p; p++) {
+    if (*p == ':' || *p == '/')
+      *p = '_';
+  }
+}
+
+/* build "<BLKTAP_CTRL_DIR>/log_<file>.<ext>" where <file> is the part of
+ * name starting at its last '/', with ':' and '/' replaced by '_'.
+ * returns a malloc'ed string the caller must free, or NULL if name
+ * contains no '/' or the allocation fails. */
+static char* ctl_makepath(const char* name, const char* ext)
+{
+  char* res;
+  char *file;
+
+  file = strrchr(name, '/');
+  if (!file) {
+    BWPRINTF("invalid name %s\n", name);
+    return NULL;
+  }
+
+  if (asprintf(&res, BLKTAP_CTRL_DIR "/log_%s.%s", file, ext) < 0) {
+    BWPRINTF("could not allocate path");
+    return NULL;
+  }
+
+  /* 5 == strlen("/log_"): escape only the <file> portion */
+  path_escape(res + strlen(BLKTAP_CTRL_DIR) + 5, strlen(file));
+
+  return res;
+}
+
+/* create, size and map the shared memory object "/log_<name>.wlog"
+ * (':' and '/' in name replaced by '_').
+ * on success s->shmpath and s->shm are set and 0 is returned.
+ * on failure both are NULL, any object created here is unlinked again,
+ * and -1 is returned. */
+static int shmem_open(struct tdlog_state* s, const char* name)
+{
+  int fd;
+
+  /* device name -> path */
+  if (asprintf(&s->shmpath, "/log_%s.wlog", name) < 0) {
+    BWPRINTF("could not allocate shm path");
+    /* the pointer is undefined after a failed asprintf; do not let
+     * shmem_close() act on it */
+    s->shmpath = NULL;
+    return -1;
+  }
+
+  /* 5 == strlen("/log_"): escape only the name portion */
+  path_escape(s->shmpath + 5, strlen(name));
+
+  if ((fd = shm_open(s->shmpath, O_CREAT|O_RDWR, 0750)) < 0) {
+    BWPRINTF("could not open shared memory file %s: %s", s->shmpath,
+            strerror(errno));
+    goto err;
+  }
+  if (ftruncate(fd, SHMSIZE) < 0) {
+    BWPRINTF("error truncating shmem to size %u", SHMSIZE);
+    close(fd);
+    goto err_unlink;
+  }
+
+  s->shm = mmap(NULL, SHMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+  close(fd);
+  if (s->shm == MAP_FAILED) {
+    BWPRINTF("could not mmap write log shm: %s", strerror(errno));
+    goto err_unlink;
+  }
+  return 0;
+
+  err_unlink:
+  /* shmpath is about to be forgotten, so shmem_close() cannot do this */
+  shm_unlink(s->shmpath);
+  err:
+  s->shm = NULL;
+  free(s->shmpath);
+  s->shmpath = NULL;
+  return -1;
+}
+
+/* unmap the shared region and remove the shared memory object.
+ * safe to call when either was never set up; always returns 0. */
+static int shmem_close(struct tdlog_state* s)
+{
+  if (s->shm) {
+    munmap(s->shm, SHMSIZE);
+    s->shm = NULL;
+  }
+
+  if (s->shmpath) {
+    shm_unlink(s->shmpath);
+    /* the path was allocated by asprintf() in shmem_open() */
+    free(s->shmpath);
+    s->shmpath = NULL;
+  }
+
+  return 0;
+}
+
+/* control socket */
+
+/* create the control socket for the device called name: build its path,
+ * remove any stale socket file, bind, listen and register ctl_accept()
+ * for incoming connections.
+ * returns 0 on success.  on failure returns -1 with s->ctl.fd == -1 and
+ * s->ctlpath == NULL. */
+static int ctl_open(struct tdlog_state* s, const char* name)
+{
+  struct sockaddr_un saddr;
+
+  /* mark the socket as not open so that ctl_close() never acts on a
+   * zero-initialised fd after an early failure */
+  s->ctl.fd = -1;
+
+  if (!(s->ctlpath = ctl_makepath(name, "ctl")))
+    return -1;
+
+  /* sun_path is a fixed-size array; refuse paths that do not fit
+   * together with their terminating NUL */
+  if (strlen(s->ctlpath) >= sizeof(saddr.sun_path)) {
+    BWPRINTF("control socket path %s is too long", s->ctlpath);
+    goto err;
+  }
+
+  if ((s->ctl.fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+    BWPRINTF("error opening control socket: %s", strerror(errno));
+    goto err;
+  }
+
+  memset(&saddr, 0, sizeof(saddr));
+  saddr.sun_family = AF_UNIX;
+  memcpy(saddr.sun_path, s->ctlpath, strlen(s->ctlpath));
+  if (unlink(s->ctlpath) && errno != ENOENT) {
+    BWPRINTF("error unlinking old socket path %s: %s", s->ctlpath,
+            strerror(errno));
+    goto err_sock;
+  }
+    
+  if (bind(s->ctl.fd, (struct sockaddr *)&saddr, sizeof(saddr)) < 0) {
+    BWPRINTF("error binding control socket to %s: %s", s->ctlpath,
+            strerror(errno));
+    goto err_sock;
+  }
+
+  if (listen(s->ctl.fd, 1) < 0) {
+    BWPRINTF("error listening on control socket: %s", strerror(errno));
+    goto err_sock;
+  }
+
+  s->ctl.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                           s->ctl.fd, 0, ctl_accept, s);
+  if (s->ctl.id < 0) {
+    /* NOTE(review): strerror() is handed the negative id as-is --
+     * confirm what tapdisk_server_register_event() returns on failure */
+    BWPRINTF("error register event handler: %s", strerror(s->ctl.id));
+    goto err_sock;
+  }
+
+  return 0;
+
+  err_sock:
+  close(s->ctl.fd);
+  s->ctl.fd = -1;
+  err:
+  free(s->ctlpath);
+  s->ctlpath = NULL;
+
+  return -1;
+}
+
+/* shut down the control channel: close every client connection, then
+ * the listening socket, remove the socket file and reset the shared
+ * ring if one was set up.  always returns 0. */
+static int ctl_close(struct tdlog_state* s)
+{
+  /* live entries occupy [0, connected): decrement first, then index */
+  while (s->connected > 0) {
+    poll_fd_t* conn = &s->connections[--s->connected];
+
+    tapdisk_server_unregister_event(conn->id);
+    close(conn->fd);
+    conn->fd = -1;
+    conn->id = 0;
+  }
+
+  if (s->ctl.fd >= 0) {
+    tapdisk_server_unregister_event(s->ctl.id);
+    close(s->ctl.fd);
+    s->ctl.fd = -1;
+    s->ctl.id = 0;
+  }
+
+  if (s->ctlpath) {
+    unlink(s->ctlpath);
+    free(s->ctlpath);
+    s->ctlpath = NULL;
+  }
+
+  /* XXX this must be fixed once requests are actually in flight */
+  /* could just drain the existing ring here first */
+  if (s->sring) {
+    SHARED_RING_INIT(s->sring);
+    BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
+  }
+
+  return 0;
+}
+
+/* walk list of open sockets, close matching fd.
+ * returns 0 if a connection was closed, -1 if fd is not one of ours. */
+static int ctl_close_sock(struct tdlog_state* s, int fd)
+{
+  int i;
+
+  /* only entries [0, connected) are live */
+  for (i = 0; i < s->connected; i++) {
+    if (s->connections[i].fd != fd)
+      continue;
+
+    tapdisk_server_unregister_event(s->connections[i].id);
+    close(s->connections[i].fd);
+
+    /* move the last live entry into the hole so the live entries stay
+     * contiguous, then blank the vacated slot */
+    s->connected--;
+    s->connections[i] = s->connections[s->connected];
+    s->connections[s->connected].fd = -1;
+    s->connections[s->connected].id = 0;
+    return 0;
+  }
+
+  BWPRINTF("requested to close unknown socket %d", fd);
+  return -1;
+}
+
+/* event callback for the listening socket: accept one connection and
+ * register ctl_request() for it.  the new connection is closed at once
+ * if all MAX_CONNECTIONS slots are in use or registration fails. */
+static void ctl_accept(event_id_t id, char mode, void *private)
+{
+  struct tdlog_state* s = (struct tdlog_state *)private;
+  int fd;
+  event_id_t cid;
+
+  if ((fd = accept(s->ctl.fd, NULL, NULL)) < 0) {
+    BWPRINTF("error accepting control connection: %s", strerror(errno));
+    return;
+  }
+
+  /* never index past the end of connections[] */
+  if (s->connected >= MAX_CONNECTIONS) {
+    BWPRINTF("control session in progress, closing new connection");
+    close(fd);
+    return;
+  }
+
+  cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                     fd, 0, ctl_request, s);
+  if (cid < 0) {
+    BWPRINTF("error registering connection event handler: %s", strerror(cid));
+    close(fd);
+    return;
+  }
+
+  s->connections[s->connected].fd = fd;
+  s->connections[s->connected].id = cid;
+  s->connected++;
+}
+
+/* send the shared memory parameters to the client on fd.
+ * response format: 4 bytes shmsize, 0-terminated path, zero padded to
+ * CTLRSPLEN_SHMP bytes.
+ * returns 0 on success, -1 if the write fails. */
+static int ctl_get_shmpath(struct tdlog_state* s, int fd)
+{
+  char msg[CTLRSPLEN_SHMP + 1];
+  uint32_t sz;
+  int rc;
+
+  BDPRINTF("ctl: sending shared memory parameters (size: %u, path: %s)",
+          SHMSIZE, s->shmpath);
+
+  /* TMP: sanity-check shm */
+  sz = 0xdeadbeef;
+  memcpy(s->shm, &sz, sizeof(sz));
+
+  /* the whole fixed-size message is sent, so zero it first rather than
+   * transmitting uninitialised stack bytes after the path */
+  memset(msg, 0, sizeof(msg));
+
+  sz = SHMSIZE;
+  memcpy(msg, &sz, sizeof(sz));
+  snprintf(msg + sizeof(sz), sizeof(msg) - sizeof(sz), "%s", s->shmpath);
+  if ((rc = write(fd, msg, CTLRSPLEN_SHMP)) < 0) {
+    BWPRINTF("error writing shmpath: %s", strerror(errno));
+    return -1;
+  }
+
+  return 0;
+}
+
+/* export the dirty bitmap to shared memory without clearing it, then
+ * acknowledge on fd.  returns 0 on success, -1 if the ack write fails. */
+static int ctl_peek_writes(struct tdlog_state* s, int fd)
+{
+  BDPRINTF("ctl: peeking bitmap");
+
+  writelog_export(s);
+
+  if (write(fd, "done", CTLRSPLEN_PEEK) >= 0)
+    return 0;
+
+  BWPRINTF("error writing peek ack: %s", strerror(errno));
+  return -1;
+}
+
+/* clear the whole dirty bitmap, then acknowledge on fd.
+ * returns 0 on success, -1 if the ack write fails. */
+static int ctl_clear_writes(struct tdlog_state* s, int fd)
+{
+  BDPRINTF("ctl: clearing bitmap");
+
+  writelog_clear(s, 0, 0);
+
+  if (write(fd, "done", CTLRSPLEN_CLEAR) >= 0)
+    return 0;
+
+  BWPRINTF("error writing clear ack: %s", strerror(errno));
+  return -1;
+}
+
+/* get dirty bitmap and clear it atomically: export to shared memory,
+ * clear the whole bitmap, then acknowledge on fd.
+ * returns 0 on success, -1 if the ack write fails. */
+static int ctl_get_writes(struct tdlog_state* s, int fd)
+{
+  BDPRINTF("ctl: getting bitmap");
+
+  writelog_export(s);
+  writelog_clear(s, 0, 0);
+
+  if (write(fd, "done", CTLRSPLEN_GET) >= 0)
+    return 0;
+
+  BWPRINTF("error writing get ack: %s", strerror(errno));
+  return -1;
+}
+
+/* get requests from ring.
+ * every pending request is consumed and, for now, simply echoed back as
+ * a response; the responses are pushed and a KICK message is sent on fd.
+ * returns 0 on success, -1 if the notification is not written in full. */
+static int ctl_kick(struct tdlog_state* s, int fd)
+{
+  RING_IDX reqstart, reqend;
+  log_request_t req;
+
+  /* XXX testing */
+  log_response_t rsp;
+  struct log_ctlmsg msg;
+  int rc;
+
+  reqstart = s->bring.req_cons;
+  reqend = s->sring->req_prod;
+
+  BDPRINTF("ctl: ring kicked (start = %u, end = %u)", reqstart, reqend);
+
+  while (reqstart != reqend) {
+    /* XXX actually submit these! */
+    memcpy(&req, RING_GET_REQUEST(&s->bring, reqstart), sizeof(req));
+    BDPRINTF("ctl: read request %"PRIu64":%u", req.sector, req.count);
+    s->bring.req_cons = ++reqstart;
+
+    rsp.sector = req.sector;
+    rsp.count = req.count;
+    memcpy(RING_GET_RESPONSE(&s->bring, s->bring.rsp_prod_pvt), &rsp,
+          sizeof(rsp));
+    s->bring.rsp_prod_pvt++;
+  }
+
+  RING_PUSH_RESPONSES(&s->bring);
+  memset(&msg, 0, sizeof(msg));
+  memcpy(msg.msg, LOGCMD_KICK, 4);
+  if ((rc = write(fd, &msg, sizeof(msg))) < 0) {
+    BWPRINTF("error sending notify: %s", strerror(errno));
+    return -1;
+  } else if (rc < sizeof(msg)) {
+    /* sizeof yields a size_t, which is printed with %zu */
+    BWPRINTF("short notify write (%d/%zu)", rc, sizeof(msg));
+    return -1;
+  }
+
+  return 0;
+}
+
+/* dispatch one control message on its 4-byte command code.
+ * returns the handler's result, or -1 for an unknown command. */
+static int ctl_do_request(struct tdlog_state* s, int fd,
+                          struct log_ctlmsg* msg)
+{
+  if (!strncmp(msg->msg, LOGCMD_SHMP, 4)) {
+    return ctl_get_shmpath(s, fd);
+  } else if (!strncmp(msg->msg, LOGCMD_PEEK, 4)) {
+    return ctl_peek_writes(s, fd);
+  } else if (!strncmp(msg->msg, LOGCMD_CLEAR, 4)) {
+    return ctl_clear_writes(s, fd);
+  } else if (!strncmp(msg->msg, LOGCMD_GET, 4)) {
+    return ctl_get_writes(s, fd);
+  } else if (!strncmp(msg->msg, LOGCMD_KICK, 4)) {
+    return ctl_kick(s, fd);
+  }
+
+  BWPRINTF("unknown control request %.4s", msg->msg);
+  return -1;
+}
+
+/* map a scheduler event id to the fd of the connection it was
+ * registered for; returns -1 if no live connection has that id */
+static inline int ctl_find_connection(struct tdlog_state *s, event_id_t id)
+{
+  poll_fd_t *conn = s->connections;
+  poll_fd_t *end  = s->connections + s->connected;
+
+  for (; conn < end; conn++) {
+    if (conn->id == id)
+      return conn->fd;
+  }
+
+  BWPRINTF("unrecognized event callback id %d", id);
+  return -1;
+}
+
+/* event callback for a client connection: read one fixed-size control
+ * message and dispatch it.  the connection is closed on a read error or
+ * EOF; a short message is logged and ignored. */
+static void ctl_request(event_id_t id, char mode, void *private)
+{
+  struct tdlog_state* s = (struct tdlog_state*)private;
+  struct log_ctlmsg msg;
+  int rc, fd = -1;
+
+  fd = ctl_find_connection(s, id);
+  if (fd == -1)
+    return;
+
+  if ((rc = read(fd, &msg, sizeof(msg))) < 0) {
+    BWPRINTF("error reading from ctl socket %d, closing: %s", fd,
+            strerror(errno));
+    ctl_close_sock(s, fd);
+    return;
+  } else if (rc == 0) {
+    BDPRINTF("ctl_request: EOF, closing socket");
+    ctl_close_sock(s, fd);
+    return;
+  } else if (rc < sizeof(msg)) {
+    /* sizeof yields a size_t, which is printed with %zu */
+    BWPRINTF("short request received (%d/%zu bytes), ignoring", rc,
+            sizeof(msg));
+    return;
+  }
+
+  ctl_do_request(s, fd, &msg);
+}
+
+/* -- interface -- */
+
+static int tdlog_close(td_driver_t*);
+
+/* open the logging driver: allocate the dirty bitmap, create the shared
+ * memory region and the control socket, and initialise the request ring
+ * inside the shared region.
+ * returns 0 on success; on failure everything set up so far is torn
+ * down through tdlog_close() and the step's error code is returned. */
+static int tdlog_open(td_driver_t* driver, const char* name, td_flag_t flags)
+{
+  struct tdlog_state* s = (struct tdlog_state*)driver->data;
+  int rc;
+
+  memset(s, 0, sizeof(*s));
+
+  /* 0 is a valid descriptor: mark the control socket as not open so the
+   * tdlog_close() calls below never close fd 0 by mistake */
+  s->ctl.fd = -1;
+
+  s->size = driver->info.size;
+
+  if ((rc = writelog_create(s))) {
+    tdlog_close(driver);
+    return rc;
+  }
+  if ((rc = shmem_open(s, name))) {
+    tdlog_close(driver);
+    return rc;
+  }
+  if ((rc = ctl_open(s, name))) {
+    tdlog_close(driver);
+    return rc;
+  }
+
+  s->sring = (log_sring_t*)sringstart(s->shm);
+  SHARED_RING_INIT(s->sring);
+  BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
+
+  BDPRINTF("opened ctl socket");
+
+  return 0;
+}
+
+/* release everything tdlog_open() set up: control channel first (it
+ * resets the ring that lives in shared memory), then the shared memory,
+ * then the dirty bitmap.  always returns 0. */
+static int tdlog_close(td_driver_t* driver)
+{
+  struct tdlog_state* s = (struct tdlog_state*)driver->data;
+
+  ctl_close(s);
+  shmem_close(s);
+  writelog_free(s);
+
+  return 0;
+}
+
+/* reads are not logged: forward the request unchanged */
+static void tdlog_queue_read(td_driver_t* driver, td_request_t treq)
+{
+  td_forward_request(treq);
+}
+
+/* mark the sectors touched by the write as dirty, then forward the
+ * request unchanged */
+static void tdlog_queue_write(td_driver_t* driver, td_request_t treq)
+{
+  struct tdlog_state* s = (struct tdlog_state*)driver->data;
+
+  writelog_set(s, treq.sec, treq.secs);
+  td_forward_request(treq);
+}
+
+/* this driver reports no parent id: always returns -EINVAL */
+static int tdlog_get_parent_id(td_driver_t* driver, td_disk_id_t* id)
+{
+  return -EINVAL;
+}
+
+/* any parent is accepted: always returns 0 */
+static int tdlog_validate_parent(td_driver_t *driver,
+                                td_driver_t *parent, td_flag_t flags)
+{
+  return 0;
+}
+
+/* driver descriptor wiring the tdlog_* callbacks into tapdisk */
+struct tap_disk tapdisk_log = {
+  .disk_type          = "tapdisk_log",
+  .private_data_size  = sizeof(struct tdlog_state),
+  .flags              = 0,
+  .td_open            = tdlog_open,
+  .td_close           = tdlog_close,
+  .td_queue_read      = tdlog_queue_read,
+  .td_queue_write     = tdlog_queue_write,
+  .td_get_parent_id   = tdlog_get_parent_id,
+  .td_validate_parent = tdlog_validate_parent,
+};
diff -r f210a633571c -r 1c627434605e tools/blktap2/drivers/block-qcow.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/drivers/block-qcow.c        Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,1517 @@
+/* block-qcow.c
+ *
+ * Asynchronous Qemu copy-on-write disk implementation.
+ * Code based on the Qemu implementation
+ * (see copyright notice below)
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ */
+
+/*
+ * Block driver for the QCOW format
+ * 
+ * Copyright (c) 2004 Fabrice Bellard
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files(the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <string.h>
+#include <zlib.h>
+#include <inttypes.h>
+#include <libaio.h>
+#include <openssl/md5.h>
+#include "bswap.h"
+#include "aes.h"
+
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "qcow.h"
+#include "blk.h"
+#include "atomicio.h"
+
+/* *BSD has no O_LARGEFILE */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE     0
+#endif
+
+/*
+ * ASSERT(_p): log the failed expression with its location, then crash on
+ * purpose by storing through a null pointer.
+ *
+ * Wrapped in do { } while (0) so it behaves as a single statement (safe in
+ * an unbraced if/else).  The store goes through a volatile pointer so the
+ * compiler cannot drop it as undefined behaviour.
+ */
+#if 1
+#define ASSERT(_p) \
+    do { \
+        if ( !(_p) ) { \
+            DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
+                    __LINE__, __FILE__); \
+            *(volatile int*)0=0; \
+        } \
+    } while (0)
+#else
+#define ASSERT(_p) ((void)0)
+#endif
+
+/*
+ * Record describing one pending AIO.
+ * NOTE(review): nothing in the visible part of this file uses this struct;
+ * the field meanings below are suggested by their names only -- confirm
+ * against the users before relying on them.
+ */
+struct pending_aio {
+        td_callback_t cb;       /* presumably the completion callback */
+        int id;
+        void *private;          /* presumably opaque data for cb */
+       int nb_sectors;
+       char *buf;
+       uint64_t sector;
+};
+
+/* Index of iocb '_io' within the state's iocb_list array. */
+#undef IOCB_IDX
+#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)
+
+/* Argument is parenthesised so that any expression can be passed safely. */
+#define ZERO_TEST(_b) ((_b) | 0x00)
+
+/*
+ * One slot of the per-image AIO request pool.  async_read()/async_write()
+ * take a slot from the free list and fill it in; tdqcow_complete() puts it
+ * back.
+ */
+struct qcow_request {
+       td_request_t         treq;    /* request to complete when the I/O finishes */
+       struct tiocb         tiocb;   /* I/O control block queued via td_queue_tiocb() */
+       struct tdqcow_state  *state;  /* owning image state (holds the free list) */
+};
+
+static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
+
+#ifdef USE_GCRYPT
+
+#include <gcrypt.h>
+
+/*
+ * Checksum 'len' bytes at 'ptr': hash them with MD5 through libgcrypt and
+ * return the first 32 bits of the digest as stored in memory (host byte
+ * order).
+ */
+uint32_t gen_cksum(char *ptr, int len)
+{
+  uint32_t md[4];   /* 16 bytes: one MD5 digest */
+
+  /* Generate checksum */
+  gcry_md_hash_buffer(GCRY_MD_MD5, md, ptr, len);
+
+  return md[0];
+}
+
+#else /* use libcrypto */
+
+#include <openssl/md5.h>
+
+/*
+ * Checksum 'len' bytes at 'ptr': hash them with OpenSSL's MD5() and return
+ * the first 32 bits of the digest as stored in memory (host byte order).
+ * Returns 0 if MD5() does not hand back the digest buffer.
+ *
+ * The digest lives on the stack; a 16-byte heap allocation per call only
+ * added a failure mode.
+ */
+uint32_t gen_cksum(char *ptr, int len)
+{
+  unsigned char md[MD5_DIGEST_LENGTH];
+  uint32_t ret = 0;
+
+  /* Generate checksum */
+  if (MD5((unsigned char *)ptr, len, md) == md)
+    memcpy(&ret, md, sizeof(ret));
+
+  return ret;
+}
+
+#endif
+
+
+/*
+ * Release the AIO request pool and its free list.  The pointers are
+ * cleared afterwards so that calling this twice (for instance from an
+ * error path and again from close) cannot double-free.
+ */
+static void free_aio_state(struct tdqcow_state* s)
+{
+       free(s->aio_requests);
+       s->aio_requests = NULL;
+
+       free(s->aio_free_list);
+       s->aio_free_list = NULL;
+}
+
+/*
+ * Allocate the pool of qcow_request slots and the free list that indexes
+ * it, and mark every slot free.
+ *
+ * The pool is sized from the page size, the cluster size and the
+ * MAX_SEGMENTS_PER_REQ / MAX_REQUESTS limits.  Requires s->cluster_size to
+ * be set (and non-zero) already.
+ *
+ * Returns 0 on success, -1 if either allocation fails; on failure nothing
+ * stays allocated, both pointers are NULL and the free count is 0.
+ */
+static int init_aio_state(td_driver_t *driver)
+{
+       int i;
+       struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+
+       /* A segment (i.e. a page) can span multiple clusters */
+       s->max_aio_reqs = ((getpagesize() / s->cluster_size) + 1) *
+               MAX_SEGMENTS_PER_REQ * MAX_REQUESTS;
+
+       s->aio_free_count = s->max_aio_reqs;
+
+       /* the free list holds pointers into the pool, not whole requests */
+       s->aio_requests  = calloc(s->max_aio_reqs,
+                                 sizeof(struct qcow_request));
+       s->aio_free_list = calloc(s->max_aio_reqs,
+                                 sizeof(struct qcow_request *));
+       if (!s->aio_requests || !s->aio_free_list) {
+               DPRINTF("Failed to allocate AIO structs (max_aio_reqs = %d)\n",
+                       s->max_aio_reqs);
+               free(s->aio_requests);
+               free(s->aio_free_list);
+               s->aio_requests   = NULL;
+               s->aio_free_list  = NULL;
+               s->aio_free_count = 0;
+               return -1;
+       }
+
+       for (i = 0; i < s->max_aio_reqs; i++)
+               s->aio_free_list[i] = &s->aio_requests[i];
+
+       DPRINTF("AIO state initialised\n");
+
+       return 0;
+}
+
+/*
+ * Work out the size, in sectors, of the image or device at 'filename'.
+ *
+ *  - If the file starts with a QCOW header, the virtual size recorded in
+ *    the header is used.
+ *  - Otherwise, for a block device (per st->st_mode) the size comes from
+ *    blk_getimagesize(); for anything else it is st->st_size.
+ *
+ * 'st' must already hold the stat() result for 'filename'.
+ * Returns 0 with *size set, or -1 if the file cannot be opened, a full
+ * header cannot be read, or the block device size cannot be obtained.
+ */
+int get_filesize(char *filename, uint64_t *size, struct stat *st)
+{
+       int fd;
+       ssize_t n;
+       QCowHeader header;
+
+       /*Set to the backing file size*/
+       fd = open(filename, O_RDONLY);
+       if (fd < 0)
+               return -1;
+       n = read(fd, &header, sizeof(header));
+       close(fd);
+       /*
+        * Test for error explicitly: comparing read()'s -1 directly against
+        * sizeof() converts it to a huge unsigned value, hiding the failure.
+        */
+       if (n < 0 || (size_t)n < sizeof(header))
+               return -1;
+
+       be32_to_cpus(&header.magic);
+       be64_to_cpus(&header.size);
+       if (header.magic == QCOW_MAGIC) {
+               *size = header.size >> SECTOR_SHIFT;
+               return 0;
+       }
+
+       if (S_ISBLK(st->st_mode)) {
+               fd = open(filename, O_RDONLY);
+               if (fd < 0)
+                       return -1;
+               if (blk_getimagesize(fd, size) != 0) {
+                       printf("Unable to get Block device size\n");
+                       close(fd);
+                       return -1;
+               }
+               close(fd);
+       } else {
+               *size = (st->st_size >> SECTOR_SHIFT);
+       }
+       return 0;
+}
+
+/*
+ * Install the AES key for an encrypted image.
+ *
+ * The key material is the first 16 bytes of the string 'key', zero-padded
+ * when the string is shorter.  Activates the crypt method named in the
+ * image header and prepares both the encrypt and decrypt key schedules.
+ * Returns 0 on success, -1 if either schedule cannot be set up.
+ */
+static int qcow_set_key(struct tdqcow_state *s, const char *key)
+{
+       uint8_t keybuf[16] = { 0 };
+       size_t keylen = strlen(key);
+
+       if (keylen > sizeof(keybuf))
+               keylen = sizeof(keybuf);
+       /* XXX: we could compress the chars to 7 bits to increase
+          entropy */
+       memcpy(keybuf, key, keylen);
+
+       s->crypt_method = s->crypt_method_header;
+
+       if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0 ||
+           AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+               return -1;
+
+       return 0;
+}
+
+/*
+ * Completion callback for I/O queued by async_read()/async_write().
+ * 'arg' is the qcow_request slot: its request is completed with 'err' and
+ * the slot is then pushed back onto the owning state's free list.
+ */
+void tdqcow_complete(void *arg, struct tiocb *tiocb, int err)
+{
+       struct qcow_request *req = (struct qcow_request *)arg;
+       struct tdqcow_state *owner = req->state;
+
+       td_complete_request(req->treq, err);
+
+       owner->aio_free_list[owner->aio_free_count] = req;
+       owner->aio_free_count++;
+}
+
+/*
+ * Queue an asynchronous read of treq.secs sectors starting at sector
+ * treq.sec of the image file into treq.buf.  If no request slot is free
+ * the request is completed immediately with -EBUSY.
+ */
+static void async_read(td_driver_t *driver, td_request_t treq)
+{
+       struct tdqcow_state *prv = (struct tdqcow_state *)driver->data;
+       struct qcow_request *aio;
+       uint64_t offset;
+       int size;
+
+       if (prv->aio_free_count == 0) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       size   = treq.secs * driver->info.sector_size;
+       offset = treq.sec  * (uint64_t)driver->info.sector_size;
+
+       /* pop a slot off the free list */
+       prv->aio_free_count--;
+       aio        = prv->aio_free_list[prv->aio_free_count];
+       aio->treq  = treq;
+       aio->state = prv;
+
+       td_prep_read(&aio->tiocb, prv->fd, treq.buf,
+                    size, offset, tdqcow_complete, aio);
+       td_queue_tiocb(driver, &aio->tiocb);
+}
+
+/*
+ * Queue an asynchronous write of treq.secs sectors from treq.buf to the
+ * image file, starting at sector treq.sec.  If no request slot is free the
+ * request is completed immediately with -EBUSY.
+ */
+static void async_write(td_driver_t *driver, td_request_t treq)
+{
+       struct tdqcow_state *prv = (struct tdqcow_state *)driver->data;
+       struct qcow_request *aio;
+       uint64_t offset;
+       int size;
+
+       if (prv->aio_free_count == 0) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       size   = treq.secs * driver->info.sector_size;
+       offset = treq.sec  * (uint64_t)driver->info.sector_size;
+
+       /* pop a slot off the free list */
+       prv->aio_free_count--;
+       aio        = prv->aio_free_list[prv->aio_free_count];
+       aio->treq  = treq;
+       aio->state = prv;
+
+       td_prep_write(&aio->tiocb, prv->fd, treq.buf,
+                     size, offset, tdqcow_complete, aio);
+       td_queue_tiocb(driver, &aio->tiocb);
+}
+
+/*
+ * Encrypt (enc != 0) or decrypt (enc == 0) 'nb_sectors' 512-byte sectors
+ * with AES-CBC.  Each sector gets its own IV: the little-endian sector
+ * number in the low 8 bytes, zero in the high 8 bytes.
+ *
+ * The crypt function is compatible with the linux cryptoloop
+ * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
+ * supported .
+ */
+static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
+                            uint8_t *out_buf, const uint8_t *in_buf,
+                            int nb_sectors, int enc,
+                            const AES_KEY *key)
+{
+       union {
+               uint64_t ll[2];
+               uint8_t b[16];
+       } iv;
+       int n;
+
+       for (n = 0; n < nb_sectors; n++) {
+               int64_t cur = sector_num + n;
+               size_t  off = (size_t)n * 512;
+
+               iv.ll[0] = cpu_to_le64(cur);
+               iv.ll[1] = 0;
+               AES_cbc_encrypt(in_buf + off, out_buf + off, 512, key,
+                               iv.b, enc);
+       }
+}
+
+/*
+ * Bring the file behind 'fd' to 'length' bytes, rounded up to a whole
+ * number of DEFAULT_SECTOR_SIZE sectors.
+ *
+ *  - Block devices are left alone (returns 0).
+ *  - If the file is shorter, zeroes are written synchronously at the end
+ *    until it reaches the target size.
+ *  - If the file is longer and 'sparse' is non-zero, it is cut down with
+ *    ftruncate(); otherwise it is left as is.
+ *
+ * Returns 0 on success, -1 on any fstat/alloc/seek/write/truncate failure.
+ */
+int qtruncate(int fd, off_t length, int sparse)
+{
+       int ret;
+       uint64_t sectors, current, i;
+       size_t rem;
+       struct stat st;
+       char *buf;
+
+       ret = fstat(fd, &st);
+       if (ret == -1)
+               return -1;
+       if (S_ISBLK(st.st_mode))
+               return 0;
+
+       /*
+        * Sector counts are 64-bit: with int, files of 1 TiB or more
+        * overflow the counters.
+        */
+       sectors = (length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
+       current = (st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
+       rem     = st.st_size % DEFAULT_SECTOR_SIZE;
+
+       /* If we are extending this file, we write zeros to the end --
+        * this tries to ensure that the extents allocated wind up being
+        * contiguous on disk.
+        */
+       if ((uint64_t)st.st_size < sectors * DEFAULT_SECTOR_SIZE) {
+               /*We are extending the file*/
+               if ((ret = posix_memalign((void **)&buf,
+                                         512, DEFAULT_SECTOR_SIZE))) {
+                       DPRINTF("posix_memalign failed: %d\n", ret);
+                       return -1;
+               }
+               memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
+               if (lseek(fd, 0, SEEK_END)==-1) {
+                       DPRINTF("Lseek EOF failed (%d), internal error\n",
+                               errno);
+                       free(buf);
+                       return -1;
+               }
+               if (rem) {
+                       /*
+                        * The file ends part-way through a sector.  Pad up
+                        * to the sector boundary ('current' already counts
+                        * this partial sector), i.e. write SECTOR - rem
+                        * bytes; writing 'rem' bytes leaves the file short.
+                        */
+                       size_t pad = DEFAULT_SECTOR_SIZE - rem;
+
+                       ret = write(fd, buf, pad);
+                       if (ret != (int)pad) {
+                               DPRINTF("write failed: ret = %d, err = %s\n",
+                                       ret, strerror(errno));
+                               free(buf);
+                               return -1;
+                       }
+               }
+               for (i = current; i < sectors; i++ ) {
+                       ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
+                       if (ret != DEFAULT_SECTOR_SIZE) {
+                               DPRINTF("write failed: ret = %d, err = %s\n",
+                                       ret, strerror(errno));
+                               free(buf);
+                               return -1;
+                       }
+               }
+               free(buf);
+       } else if (sparse &&
+                  ((uint64_t)st.st_size > sectors * DEFAULT_SECTOR_SIZE)) {
+               if (ftruncate(fd, (off_t)sectors * DEFAULT_SECTOR_SIZE)==-1) {
+                       DPRINTF("Ftruncate failed (%s)\n", strerror(errno));
+                       return -1;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Map a guest byte 'offset' to the image-file offset of the cluster that
+ * holds it, optionally allocating the cluster (and, if needed, a new L2
+ * table) at the end of the file.
+ *
+ * 'allocate' is:
+ *
+ * 0 to not allocate.
+ *
+ * 1 to allocate a normal cluster (for sector indexes 'n_start' to
+ * 'n_end')
+ *
+ * 2 to allocate a compressed cluster of size
+ * 'compressed_size'. 'compressed_size' must be > 0 and <
+ * cluster_size
+ *
+ * Returns the cluster offset (with QCOW_OFLAG_COMPRESSED and the
+ * compressed size folded in for compressed clusters), or 0 if the cluster
+ * is not allocated or any allocation/seek/read/write step fails.  Every
+ * failure returns 0: the return type is unsigned, so -1 would look like a
+ * valid, enormous offset.
+ *
+ * All metadata I/O done here is synchronous.  L1 entries are kept in host
+ * byte order in memory; cached L2 tables are kept big-endian.
+ */
+static uint64_t get_cluster_offset(struct tdqcow_state *s,
+                                   uint64_t offset, int allocate,
+                                   int compressed_size,
+                                   int n_start, int n_end)
+{
+       int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
+       char *tmp_ptr2, *l2_ptr, *l1_ptr;
+       uint64_t *tmp_ptr;
+       uint64_t l2_offset, *l2_table, cluster_offset;
+       uint32_t min_count;
+       int new_l2_table;
+       off_t pos;
+
+       /*Check L1 table for the extent offset*/
+       l1_index = offset >> (s->l2_bits + s->cluster_bits);
+       /* an offset beyond the image must not index past the L1 table */
+       if (l1_index < 0 || l1_index >= s->l1_size)
+               return 0;
+       l2_offset = s->l1_table[l1_index];
+       new_l2_table = 0;
+       if (!l2_offset) {
+               if (!allocate)
+                       return 0;
+               /*
+                * allocating a new l2 entry + extent
+                * at the end of the file, we must also
+                * update the L1 entry safely.
+                */
+               l2_offset = s->fd_end;
+
+               /* round to cluster size */
+               l2_offset = (l2_offset + s->cluster_size - 1)
+                       & ~(s->cluster_size - 1);
+
+               /* update the L1 entry */
+               s->l1_table[l1_index] = l2_offset;
+
+               /*Truncate file for L2 table
+                *(initialised to zero in case we crash)*/
+               if (qtruncate(s->fd,
+                             l2_offset + (s->l2_size * sizeof(uint64_t)),
+                             s->sparse) != 0) {
+                       DPRINTF("ERROR truncating file\n");
+                       return 0;
+               }
+               s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));
+
+               /*Update the L1 table entry on disk
+                 * (for O_DIRECT we write 4KByte blocks)*/
+               l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
+               l1_ptr = (char *)s->l1_table + (l1_sector << 12);
+
+               if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
+                       DPRINTF("ERROR allocating memory for L1 table\n");
+                       /* tmp_ptr is undefined here; do not touch it */
+                       return 0;
+               }
+               memcpy(tmp_ptr, l1_ptr, 4096);
+
+               /* Convert block to write to big endian */
+               for(i = 0; i < 4096 / sizeof(uint64_t); i++) {
+                       cpu_to_be64s(&tmp_ptr[i]);
+               }
+
+               /*
+                * Issue non-asynchronous L1 write.
+                * For safety, we must ensure that
+                * entry is written before blocks.
+                */
+               if (lseek(s->fd, s->l1_table_offset + (l1_sector << 12),
+                         SEEK_SET) == (off_t)-1 ||
+                   write(s->fd, tmp_ptr, 4096) != 4096) {
+                       free(tmp_ptr);
+                       return 0;
+               }
+               free(tmp_ptr);
+
+               new_l2_table = 1;
+               goto cache_miss;
+       } else if (s->min_cluster_alloc == s->l2_size) {
+               /*Fast-track the request*/
+               cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
+               l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+               return cluster_offset + (l2_index * s->cluster_size);
+       }
+
+       /*Check to see if L2 entry is already cached*/
+       for (i = 0; i < L2_CACHE_SIZE; i++) {
+               if (l2_offset == s->l2_cache_offsets[i]) {
+                       /* increment the hit count; halve all counts when
+                        * one saturates so their relative order survives */
+                       if (++s->l2_cache_counts[i] == 0xffffffff) {
+                               for (j = 0; j < L2_CACHE_SIZE; j++) {
+                                       s->l2_cache_counts[j] >>= 1;
+                               }
+                       }
+                       l2_table = s->l2_cache + (i << s->l2_bits);
+                       goto found;
+               }
+       }
+
+cache_miss:
+       /* not found: load a new entry in the least used one */
+       min_index = 0;
+       min_count = 0xffffffff;
+       for (i = 0; i < L2_CACHE_SIZE; i++) {
+               if (s->l2_cache_counts[i] < min_count) {
+                       min_count = s->l2_cache_counts[i];
+                       min_index = i;
+               }
+       }
+       l2_table = s->l2_cache + (min_index << s->l2_bits);
+
+       /*If extent pre-allocated, read table from disk,
+        *otherwise write new table to disk*/
+       if (new_l2_table) {
+               /*Should we allocate the whole extent? Adjustable parameter.*/
+               if (s->cluster_alloc == s->l2_size) {
+                       cluster_offset = l2_offset +
+                               (s->l2_size * sizeof(uint64_t));
+                       cluster_offset = (cluster_offset + s->cluster_size - 1)
+                               & ~(s->cluster_size - 1);
+                       if (qtruncate(s->fd, cluster_offset +
+                                     (s->cluster_size * s->l2_size),
+                                     s->sparse) != 0) {
+                               DPRINTF("ERROR truncating file\n");
+                               return 0;
+                       }
+                       s->fd_end = cluster_offset +
+                               (s->cluster_size * s->l2_size);
+                       for (i = 0; i < s->l2_size; i++) {
+                               l2_table[i] = cpu_to_be64(cluster_offset +
+                                                         (i*s->cluster_size));
+                       }
+               } else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+
+               if (lseek(s->fd, l2_offset, SEEK_SET) == (off_t)-1 ||
+                   write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
+                   s->l2_size * sizeof(uint64_t))
+                       return 0;
+       } else {
+               if (lseek(s->fd, l2_offset, SEEK_SET) == (off_t)-1 ||
+                   read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
+                   s->l2_size * sizeof(uint64_t))
+                       return 0;
+       }
+
+       /*Update the cache entries*/
+       s->l2_cache_offsets[min_index] = l2_offset;
+       s->l2_cache_counts[min_index] = 1;
+
+found:
+       /*The extent is split into 's->l2_size' blocks of
+        *size 's->cluster_size'*/
+       l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+       cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+       if (!cluster_offset ||
+           ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) {
+               if (!allocate)
+                       return 0;
+
+               if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
+                   (n_end - n_start) < s->cluster_sectors) {
+                       /* cluster is already allocated but compressed, we must
+                          decompress it in the case it is not completely
+                          overwritten */
+                       if (decompress_cluster(s, cluster_offset) < 0)
+                               return 0;
+                       pos = lseek(s->fd, s->fd_end, SEEK_SET);
+                       if (pos == (off_t)-1)
+                               return 0;
+                       cluster_offset = pos;
+                       cluster_offset = (cluster_offset + s->cluster_size - 1)
+                               & ~(s->cluster_size - 1);
+                       /* write the cluster content - not asynchronous */
+                       if (lseek(s->fd, cluster_offset,
+                                 SEEK_SET) == (off_t)-1 ||
+                           write(s->fd, s->cluster_cache, s->cluster_size) !=
+                           s->cluster_size)
+                               return 0;
+                       /* the file grew: without this the next allocation
+                        * would land on top of the cluster just written */
+                       s->fd_end = cluster_offset + s->cluster_size;
+               } else {
+                       /* allocate a new cluster */
+                       pos = lseek(s->fd, s->fd_end, SEEK_SET);
+                       if (pos == (off_t)-1)
+                               return 0;
+                       cluster_offset = pos;
+                       if (allocate == 1) {
+                               /* round to cluster size */
+                               cluster_offset =
+                                       (cluster_offset + s->cluster_size - 1)
+                                       & ~(s->cluster_size - 1);
+                               if (qtruncate(s->fd, cluster_offset +
+                                             s->cluster_size, s->sparse)!=0) {
+                                       DPRINTF("ERROR truncating file\n");
+                                       return 0;
+                               }
+                               s->fd_end = (cluster_offset + s->cluster_size);
+                               /* if encrypted, we must initialize the cluster
+                                  content which won't be written */
+                               if (s->crypt_method &&
+                                   (n_end - n_start) < s->cluster_sectors) {
+                                       uint64_t start_sect;
+                                       start_sect = (offset &
+                                                     ~(s->cluster_size - 1))
+                                                             >> 9;
+                                       memset(s->cluster_data + 512,
+                                              0xaa, 512);
+                                       for (i = 0; i < s->cluster_sectors; i++) {
+                                               if (i >= n_start && i < n_end)
+                                                       continue;
+                                               encrypt_sectors(s, start_sect + i,
+                                                               s->cluster_data,
+                                                               s->cluster_data + 512,
+                                                               1, 1,
+                                                               &s->aes_encrypt_key);
+                                               if (lseek(s->fd,
+                                                         cluster_offset + i * 512,
+                                                         SEEK_SET) == (off_t)-1 ||
+                                                   write(s->fd, s->cluster_data,
+                                                         512) != 512)
+                                                       return 0;
+                                       }
+                               }
+                       } else {
+                               cluster_offset |= QCOW_OFLAG_COMPRESSED |
+                                       (uint64_t)compressed_size
+                                               << (63 - s->cluster_bits);
+                       }
+               }
+               /* update L2 table (cached copy stays big-endian) */
+               l2_table[l2_index] = cpu_to_be64(cluster_offset);
+
+               /*For IO_DIRECT we write 4KByte blocks*/
+               l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
+               l2_ptr = (char *)l2_table + (l2_sector << 12);
+
+               if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
+                       DPRINTF("ERROR allocating memory for L2 table\n");
+                       /* tmp_ptr2 is undefined here; do not touch it */
+                       return 0;
+               }
+               memcpy(tmp_ptr2, l2_ptr, 4096);
+               if (lseek(s->fd, l2_offset + (l2_sector << 12),
+                         SEEK_SET) == (off_t)-1 ||
+                   write(s->fd, tmp_ptr2, 4096) != 4096) {
+                       free(tmp_ptr2);
+                       return 0;
+               }
+               free(tmp_ptr2);
+       }
+       return cluster_offset;
+}
+
+/*
+ * Report whether the cluster containing 'sector_num' is allocated in this
+ * image.  *pnum receives how many of the 'nb_sectors' requested sectors
+ * lie in that same cluster (the answer applies to all of them).
+ * Returns non-zero if allocated, 0 otherwise.
+ */
+static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num,
+                             int nb_sectors, int *pnum)
+{
+       uint64_t where;
+       int first, left;
+
+       /* lookup only: allocate == 0 */
+       where = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0);
+
+       first = sector_num & (s->cluster_sectors - 1);
+       left  = s->cluster_sectors - first;
+
+       *pnum = (left > nb_sectors) ? nb_sectors : left;
+
+       return (where != 0);
+}
+
+/*
+ * Inflate the raw deflate stream in 'buf' (window bits -12) into
+ * 'out_buf'.  Succeeds only if inflate() ends with Z_STREAM_END or
+ * Z_BUF_ERROR and exactly 'out_buf_size' bytes were produced.
+ * Returns 0 on success, -1 otherwise.
+ */
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+                             const uint8_t *buf, int buf_size)
+{
+       z_stream zs;
+       int rc, produced, ok;
+
+       memset(&zs, 0, sizeof(zs));
+
+       zs.next_in   = (uint8_t *)buf;
+       zs.avail_in  = buf_size;
+       zs.next_out  = out_buf;
+       zs.avail_out = out_buf_size;
+
+       if (inflateInit2(&zs, -12) != Z_OK)
+               return -1;
+
+       rc       = inflate(&zs, Z_FINISH);
+       produced = zs.next_out - out_buf;
+       ok       = (rc == Z_STREAM_END || rc == Z_BUF_ERROR) &&
+                  (produced == out_buf_size);
+
+       inflateEnd(&zs);
+
+       return ok ? 0 : -1;
+}
+                              
+/*
+ * Make s->cluster_cache hold the decompressed contents of the compressed
+ * cluster described by the L2 entry 'cluster_offset'.
+ *
+ * The entry packs the file offset (low bits, under cluster_offset_mask)
+ * and the compressed size (bits above 63 - cluster_bits).  Nothing is
+ * read if the cache already holds this cluster.
+ * Returns 0 on success, -1 on a short read or decompression failure.
+ */
+static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
+{
+       uint64_t coffset = cluster_offset & s->cluster_offset_mask;
+       int ret, csize;
+
+       /* cache hit */
+       if (s->cluster_cache_offset == coffset)
+               return 0;
+
+       csize  = cluster_offset >> (63 - s->cluster_bits);
+       csize &= (s->cluster_size - 1);
+
+       lseek(s->fd, coffset, SEEK_SET);
+       ret = read(s->fd, s->cluster_data, csize);
+       if (ret != csize)
+               return -1;
+
+       if (decompress_buffer(s->cluster_cache, s->cluster_size,
+                             s->cluster_data, csize) < 0)
+               return -1;
+
+       s->cluster_cache_offset = coffset;
+       return 0;
+}
+
+/*
+ * Read the QCOW header from the start of 'fd' into 'header', converting
+ * its multi-byte fields from big-endian to host order.
+ *
+ * The read goes through a zeroed, 512-byte aligned bounce buffer whose
+ * size is the header size rounded up to 512.  A regular file shorter than
+ * that is accepted as long as the whole file is read; the missing bytes
+ * stay zero.
+ *
+ * Returns 0 on success or a negative errno value on failure (-EIO for a
+ * short read that set no errno).
+ */
+static int
+tdqcow_read_header(int fd, QCowHeader *header)
+{
+       int err;
+       char *buf;
+       struct stat st;
+       ssize_t got;
+       size_t size, expected;
+
+       memset(header, 0, sizeof(*header));
+
+       if (fstat(fd, &st))
+               return -errno;
+
+       if (lseek(fd, 0, SEEK_SET) == (off_t)-1)
+               return -errno;
+
+       size = (sizeof(*header) + 511) & ~511;
+       err = posix_memalign((void **)&buf, 512, size);
+       if (err)
+               return -err;    /* posix_memalign returns a positive errno */
+       memset(buf, 0, size);   /* short files must not leak heap garbage */
+
+       /*
+        * st_size is only meaningful for regular files; a block device
+        * reports 0 there, which would make every read look "too long".
+        */
+       expected = size;
+       if (S_ISREG(st.st_mode) && (uint64_t)st.st_size < size)
+               expected = st.st_size;
+
+       errno = 0;
+       got = read(fd, buf, size);
+       if (got < 0 || (size_t)got != expected) {
+               err = (errno ? -errno : -EIO);
+               goto out;
+       }
+
+       memcpy(header, buf, sizeof(*header));
+       be32_to_cpus(&header->magic);
+       be32_to_cpus(&header->version);
+       be64_to_cpus(&header->backing_file_offset);
+       be32_to_cpus(&header->backing_file_size);
+       be32_to_cpus(&header->mtime);
+       be64_to_cpus(&header->size);
+       be32_to_cpus(&header->crypt_method);
+       be64_to_cpus(&header->l1_table_offset);
+
+       err = 0;
+
+out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Allocate s->l1_table and load the L1 table from the image.
+ *
+ * Everything from the start of the file up to the end of the L1 table
+ * (rounded up to 4 KiB) is read into a temporary buffer, so the Xen
+ * extended header that follows the QCOW header can be examined too:
+ *
+ *  - If the extended header is present but lacks EXTHDR_L1_BIG_ENDIAN, the
+ *    table is converted to big-endian, the flag is set, and the whole
+ *    block is written back to the file.
+ *  - If the extended header's checksum matches the table, s->extended,
+ *    s->sparse and s->min_cluster_alloc are taken from it.
+ *
+ * On return the in-memory table is in host byte order.  Sets s->l1_size,
+ * s->l1_table_offset and s->min_cluster_alloc (default 1).
+ *
+ * Returns 0 on success or a negative errno value; on failure s->l1_table
+ * is freed and set to NULL.  The temporary buffer is always released.
+ */
+static int
+tdqcow_load_l1_table(struct tdqcow_state *s, QCowHeader *header)
+{
+       char *buf;
+       struct stat st;
+       size_t expected, copy_len;
+       ssize_t got;
+       int i, err, shift;
+       QCowHeader_ext *exthdr;
+       uint32_t l1_table_bytes, l1_table_block, l1_table_size;
+
+       buf         = NULL;
+       s->l1_table = NULL;
+
+       shift = s->cluster_bits + s->l2_bits;
+
+       s->l1_size = (header->size + (1LL << shift) - 1) >> shift;
+       s->l1_table_offset = header->l1_table_offset;
+
+       s->min_cluster_alloc = 1; /* default */
+
+       l1_table_bytes = s->l1_size * sizeof(uint64_t);
+       l1_table_size  = (l1_table_bytes + 4095) & ~4095;
+       l1_table_block = (l1_table_bytes + s->l1_table_offset + 4095) & ~4095;
+
+       DPRINTF("L1 Table offset detected: %"PRIu64", size %d (%d)\n",
+               (uint64_t)s->l1_table_offset,
+               (int) (s->l1_size * sizeof(uint64_t)), 
+               l1_table_size);
+
+       /* the table must start inside the block we are about to read */
+       if (s->l1_table_offset > l1_table_block) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * The in-memory table is 4 KiB-rounded, but the source buffer only
+        * reaches round_up(offset + bytes).  With an unaligned table offset
+        * the rounded copy would run past the end of buf, so clamp it.
+        */
+       copy_len = l1_table_size;
+       if (copy_len > l1_table_block - s->l1_table_offset)
+               copy_len = l1_table_block - s->l1_table_offset;
+
+       if (fstat(s->fd, &st)) {
+               err = -errno;
+               goto out;
+       }
+
+       if (lseek(s->fd, 0, SEEK_SET) == (off_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       err = posix_memalign((void **)&buf, 512, l1_table_block);
+       if (err) {
+               buf = NULL;
+               err = -err;     /* posix_memalign returns a positive errno */
+               goto out;
+       }
+
+       err = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
+       if (err) {
+               s->l1_table = NULL;
+               err = -err;
+               goto out;
+       }
+
+       memset(buf, 0, l1_table_block);
+       memset(s->l1_table, 0, l1_table_size);
+
+       /*
+        * st_size is only meaningful for regular files; a block device
+        * reports 0 there, which would make every read look "too long".
+        */
+       expected = l1_table_block;
+       if (S_ISREG(st.st_mode) && (uint64_t)st.st_size < l1_table_block)
+               expected = st.st_size;
+
+       errno = 0;
+       got = read(s->fd, buf, l1_table_block);
+       if (got < 0 || (size_t)got != expected) {
+               err = (errno ? -errno : -EIO);
+               goto out;
+       }
+
+       memcpy(s->l1_table, buf + s->l1_table_offset, copy_len);
+       exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
+
+       /* check for xen extended header */
+       if (s->l1_table_offset % 4096 == 0 &&
+           be32_to_cpu(exthdr->xmagic) == XEN_MAGIC) {
+               uint32_t flags = be32_to_cpu(exthdr->flags);
+               uint32_t cksum = be32_to_cpu(exthdr->cksum);
+
+               /*
+                * Try to detect old tapdisk images. They have to be fixed
+                * because they store the L1 table in native endian rather
+                * than big endian.  After this block, the l1 table will
+                * definitely be in BIG endian.
+                */
+               if (!(flags & EXTHDR_L1_BIG_ENDIAN)) {
+                       DPRINTF("qcow: converting to big endian L1 table\n");
+
+                       /* convert to big endian */
+                       for (i = 0; i < s->l1_size; i++)
+                               cpu_to_be64s(&s->l1_table[i]);
+
+                       flags |= EXTHDR_L1_BIG_ENDIAN;
+                       exthdr->flags = cpu_to_be32(flags);
+
+                       memcpy(buf + s->l1_table_offset,
+                              s->l1_table, copy_len);
+
+                       if (lseek(s->fd, 0, SEEK_SET) == (off_t)-1) {
+                               err = -errno;
+                               goto out;
+                       }
+
+                       errno = 0;
+                       if (atomicio(vwrite, s->fd, buf, l1_table_block) !=
+                           l1_table_block) {
+                               /* never report a failed write as success */
+                               err = (errno ? -errno : -EIO);
+                               goto out;
+                       }
+               }
+
+               /* check the L1 table checksum */
+               if (cksum != gen_cksum((char *)s->l1_table,
+                                      s->l1_size * sizeof(uint64_t)))
+                       DPRINTF("qcow: bad L1 checksum\n");
+               else {
+                       s->extended = 1;
+                       s->sparse = (be32_to_cpu(exthdr->flags) & SPARSE_FILE);
+                       s->min_cluster_alloc =
+                               be32_to_cpu(exthdr->min_cluster_alloc);
+               }
+       }
+
+       /* convert L1 table to native endian for operation */
+       for (i = 0; i < s->l1_size; i++)
+               be64_to_cpus(&s->l1_table[i]);
+
+       err = 0;
+
+out:
+       /* the bounce buffer is never needed after this call */
+       free(buf);
+       if (err) {
+               free(s->l1_table);
+               s->l1_table = NULL;
+       }
+       return err;
+}
+
+/*
+ * Open the qcow image file and initialise the driver's qcow state.
+ *
+ * The file is opened with O_DIRECT (read-only when flags is
+ * TD_OPEN_RDONLY, read-write otherwise).  The header is read and checked,
+ * the L1 table is loaded, and the L2 cache, the two cluster buffers and
+ * the AIO state are set up.  Only QCOW_VERSION images are accepted;
+ * version 2 images are rejected because qcow2 has not been ported to this
+ * framework yet.
+ *
+ * Returns 0 on success and -1 on any failure.  On failure every resource
+ * acquired here (including the copy of the file name) is released and
+ * the corresponding pointers in the state are reset to NULL.
+ */
+int tdqcow_open (td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       int fd, i, ret, size, o_flags;
+       td_disk_info_t *bs = &(driver->info);
+       struct tdqcow_state   *s  = (struct tdqcow_state *)driver->data;
+       QCowHeader header;
+       uint64_t final_cluster = 0;
+
+       DPRINTF("QCOW: Opening %s\n", name);
+
+       o_flags = O_DIRECT | O_LARGEFILE |
+               ((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+       fd = open(name, o_flags);
+       if (fd < 0) {
+               DPRINTF("Unable to open %s (%d)\n", name, -errno);
+               return -1;
+       }
+
+       s->fd = fd;
+       s->name = strdup(name);
+       if (!s->name)
+               goto fail;
+
+       if (tdqcow_read_header(fd, &header))
+               goto fail;
+
+       if (header.magic != QCOW_MAGIC)
+               goto fail;
+
+       switch (header.version) {
+       case QCOW_VERSION:
+               break;
+       case 2:
+               /* TODO: Port qcow2 to new blktap framework. */
+               goto fail;
+       default:
+               goto fail;
+       }
+
+       if (header.size <= 1 || header.cluster_bits < 9)
+               goto fail;
+       if (header.crypt_method > QCOW_CRYPT_AES)
+               goto fail;
+       s->crypt_method_header = header.crypt_method;
+       if (s->crypt_method_header)
+               s->encrypted = 1;
+       s->cluster_bits = header.cluster_bits;
+       s->cluster_size = 1 << s->cluster_bits;
+       s->cluster_sectors = 1 << (s->cluster_bits - 9);
+       s->l2_bits = header.l2_bits;
+       s->l2_size = 1 << s->l2_bits;
+       s->cluster_alloc = s->l2_size;
+       bs->size = header.size / 512;
+       s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
+       s->backing_file_offset = header.backing_file_offset;
+       s->backing_file_size   = header.backing_file_size;
+
+       /* allocate and load l1 table */
+       if (tdqcow_load_l1_table(s, &header))
+               goto fail;
+
+       /* alloc L2 cache */
+       size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
+       ret = posix_memalign((void **)&s->l2_cache, 4096, size);
+       if (ret != 0)
+               goto fail;
+
+       size = s->cluster_size;
+       ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
+       if (ret != 0)
+               goto fail;
+
+       ret = posix_memalign((void **)&s->cluster_data, 4096, size);
+       if (ret != 0)
+               goto fail;
+       s->cluster_cache_offset = -1;
+
+       if (s->backing_file_offset != 0)
+               s->cluster_alloc = 1; /*Cannot use pre-alloc*/
+
+       bs->sector_size = 512;
+       bs->info = 0;
+
+       /* largest L1 entry; zero means no L2 table has been allocated */
+       for (i = 0; i < s->l1_size; i++)
+               if (s->l1_table[i] > final_cluster)
+                       final_cluster = s->l1_table[i];
+
+       if (init_aio_state(driver) != 0) {
+               /*
+                * The fail path below already releases the AIO state;
+                * releasing it here as well would do so twice.
+                */
+               DPRINTF("Unable to initialise AIO state\n");
+               goto fail;
+       }
+
+       if (!final_cluster)
+               s->fd_end = s->l1_table_offset +
+                       ((s->l1_size * sizeof(uint64_t) + 4095) & ~4095);
+       else {
+               s->fd_end = lseek64(fd, 0, SEEK_END);
+               if (s->fd_end == (off64_t)-1)
+                       goto fail;
+       }
+
+       return 0;
+
+fail:
+       DPRINTF("QCOW Open failed\n");
+
+       free_aio_state(s);
+       free(s->name);
+       s->name = NULL;
+       free(s->l1_table);
+       s->l1_table = NULL;
+       free(s->l2_cache);
+       s->l2_cache = NULL;
+       free(s->cluster_cache);
+       s->cluster_cache = NULL;
+       free(s->cluster_data);
+       s->cluster_data = NULL;
+       close(fd);
+       return -1;
+}
+
+/*
+ * Queue a read request, splitting it at cluster boundaries.
+ *
+ * For each piece the cluster offset is looked up without allocating:
+ *   - offset 0: the piece is passed on with td_forward_request();
+ *   - compressed cluster: it is decompressed, the wanted sectors are
+ *     copied out of s->cluster_cache and the piece is completed at once;
+ *   - otherwise: the piece is handed to async_read().
+ * If no AIO slot is free the request is completed with -EBUSY, and a
+ * decompression failure completes it with -EIO; both stop processing.
+ */
+void tdqcow_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+       td_request_t clone = treq;
+       uint64_t sector = treq.sec;
+       uint64_t remaining = treq.secs;
+       char *dst = treq.buf;
+       uint64_t coff;
+       int first, count;
+
+       for (; remaining > 0;
+            remaining -= count, sector += count, dst += count * 512) {
+               coff  = get_cluster_offset(s, sector << 9, 0, 0, 0, 0);
+               first = sector & (s->cluster_sectors - 1);
+
+               /* never cross the end of the current cluster */
+               count = s->cluster_sectors - first;
+               if (count > remaining)
+                       count = remaining;
+
+               if (s->aio_free_count == 0) {
+                       td_complete_request(treq, -EBUSY);
+                       return;
+               }
+
+               if (!coff) {
+                       treq.buf  = dst;
+                       treq.sec  = sector;
+                       treq.secs = count;
+                       td_forward_request(treq);
+                       continue;
+               }
+
+               if (!(coff & QCOW_OFLAG_COMPRESSED)) {
+                       clone.buf  = dst;
+                       clone.sec  = (coff >> 9) + first;
+                       clone.secs = count;
+                       async_read(driver, clone);
+                       continue;
+               }
+
+               if (decompress_cluster(s, coff) < 0) {
+                       td_complete_request(treq, -EIO);
+                       return;
+               }
+               memcpy(dst, s->cluster_cache + first * 512, 512 * count);
+
+               treq.buf  = dst;
+               treq.sec  = sector;
+               treq.secs = count;
+               td_complete_request(treq, 0);
+       }
+}
+
+/*
+ * Queue a write request, splitting it at cluster boundaries.
+ *
+ * Each piece gets a cluster offset from get_cluster_offset() (called
+ * with allocate = 1) and is then handed to async_write().  The request
+ * is completed with -EBUSY when no AIO slot is free and with -EIO when
+ * no cluster offset is returned; both stop processing immediately.
+ * After all pieces are queued the compressed-cluster cache is
+ * invalidated.
+ *
+ * NOTE(review): when s->crypt_method is set, encrypt_sectors() writes
+ * its output to s->cluster_data, yet the buffer queued for writing is
+ * still the caller's buffer.  Confirm whether encrypted images are
+ * expected to work through this path.
+ */
+void tdqcow_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+       td_request_t clone = treq;
+       uint64_t sector = treq.sec;
+       uint64_t remaining = treq.secs;
+       char *src = treq.buf;
+       uint64_t coff;
+       int first, count;
+
+       for (; remaining > 0;
+            remaining -= count, sector += count, src += count * 512) {
+               first = sector & (s->cluster_sectors - 1);
+
+               /* never cross the end of the current cluster */
+               count = s->cluster_sectors - first;
+               if (count > remaining)
+                       count = remaining;
+
+               if (s->aio_free_count == 0) {
+                       td_complete_request(treq, -EBUSY);
+                       return;
+               }
+
+               coff = get_cluster_offset(s, sector << 9, 1, 0,
+                                         first, first + count);
+               if (!coff) {
+                       DPRINTF("Ooops, no write cluster offset!\n");
+                       td_complete_request(treq, -EIO);
+                       return;
+               }
+
+               if (s->crypt_method)
+                       encrypt_sectors(s, sector, s->cluster_data,
+                                       (unsigned char *)src, count, 1,
+                                       &s->aes_encrypt_key);
+
+               clone.buf  = src;
+               clone.sec  = (coff >> 9) + first;
+               clone.secs = count;
+               async_write(driver, clone);
+       }
+
+       s->cluster_cache_offset = -1; /* disable compressed cache */
+}
+
+/*
+ * Recompute the checksum of the L1 table and store it in the cksum field
+ * of the extended header on disk.
+ *
+ * Does nothing (and returns 0) when the image has no extended header.
+ * The image is reopened by name without O_DIRECT for this small write.
+ * The checksum is taken over the big-endian form of the table, so the
+ * in-memory table is byte-swapped for the computation and swapped back
+ * afterwards.
+ *
+ * Returns 0 on success or a positive errno value on failure; a short
+ * write that leaves errno unset is reported as EIO.
+ */
+static int
+tdqcow_update_checksum(struct tdqcow_state *s)
+{
+       int i, fd, err;
+       uint32_t offset, cksum, out;
+       ssize_t written;
+
+       if (!s->extended)
+               return 0;
+
+       fd = open(s->name, O_WRONLY | O_LARGEFILE); /* open without O_DIRECT */
+       if (fd == -1) {
+               err = errno;
+               goto out;
+       }
+
+       offset = sizeof(QCowHeader) + offsetof(QCowHeader_ext, cksum);
+       if (lseek(fd, offset, SEEK_SET) == (off_t)-1) {
+               err = errno;
+               goto out;
+       }
+
+       /* convert to big endian for checksum */
+       for (i = 0; i < s->l1_size; i++)
+               cpu_to_be64s(&s->l1_table[i]);
+
+       cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t));
+
+       /* and back again... */
+       for (i = 0; i < s->l1_size; i++)
+               be64_to_cpus(&s->l1_table[i]);
+
+       DPRINTF("Writing cksum: %d", cksum);
+
+       out = cpu_to_be32(cksum);
+       written = write(fd, &out, sizeof(out));
+       if (written != (ssize_t)sizeof(out)) {
+               /* errno is only meaningful when write() itself failed */
+               err = (written < 0) ? errno : EIO;
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       if (err)
+               DPRINTF("failed to update checksum: %d\n", err);
+       if (fd != -1)
+               close(fd);
+       return err;
+}
+               
+/*
+ * Close a qcow image: refresh the on-disk L1 checksum, then release the
+ * AIO state, every buffer owned by the state and the file descriptor.
+ * The result of the checksum update is not propagated; this always
+ * returns 0.
+ */
+int tdqcow_close(td_driver_t *driver)
+{
+       struct tdqcow_state *state = (struct tdqcow_state *)driver->data;
+
+       /* must run first: it still needs the name and the L1 table */
+       tdqcow_update_checksum(state);
+
+       free_aio_state(state);
+
+       free(state->cluster_data);
+       free(state->cluster_cache);
+       free(state->l2_cache);
+       free(state->l1_table);
+       free(state->name);
+
+       close(state->fd);
+
+       return 0;
+}
+
+/*
+ * Create a new qcow image at `filename`.
+ *
+ * filename     - path of the image to create (truncated if it exists)
+ * total_size   - requested size; shifted right by SECTOR_SHIFT to get the
+ *                sector count, and replaced by the backing file's size
+ *                when a backing file is used
+ * backing_file - optional backing image; NULL for none.  The literal
+ *                "fat:" is treated as "no backing file".
+ * sparse       - when 0 the file is extended up front to its full
+ *                length, otherwise the SPARSE_FILE flag is recorded
+ *
+ * The file receives the qcow header, the extended header, the backing
+ * file name (if any) and a zeroed L1 table.
+ *
+ * Returns 0 on success and -1 on failure.  The descriptor is closed on
+ * every path, and failed seeks or writes are reported instead of being
+ * ignored.
+ */
+int qcow_create(const char *filename, uint64_t total_size,
+               const char *backing_file, int sparse)
+{
+       int fd, header_size, backing_filename_len, l1_size, i;
+       int shift, length, flags = 0, rc = -1;
+       QCowHeader header;
+       QCowHeader_ext exthdr;
+       char backing_filename[PATH_MAX], *ptr;
+       uint64_t tmp, size, total_length;
+       struct stat st;
+
+       DPRINTF("Qcow_create: size %"PRIu64"\n",total_size);
+
+       fd = open(filename,
+                 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+                 0644);
+       if (fd < 0)
+               return -1;
+
+       memset(&header, 0, sizeof(header));
+       memset(&exthdr, 0, sizeof(exthdr));
+       /* st.st_mtime is read below even on paths that never stat() */
+       memset(&st, 0, sizeof(st));
+
+       header.magic = cpu_to_be32(QCOW_MAGIC);
+       header.version = cpu_to_be32(QCOW_VERSION);
+
+       /*Create extended header fields*/
+       exthdr.xmagic = cpu_to_be32(XEN_MAGIC);
+
+       header_size = sizeof(header) + sizeof(QCowHeader_ext);
+       backing_filename_len = 0;
+       size = (total_size >> SECTOR_SHIFT);
+       if (backing_file) {
+               if (strcmp(backing_file, "fat:")) {
+                       const char *p;
+                       /* XXX: this is a hack: we do not attempt to
+                        * check for URL like syntax */
+                       p = strchr(backing_file, ':');
+                       if (p && (p - backing_file) >= 2) {
+                               /* URL like but exclude "c:" like filenames */
+                               strncpy(backing_filename, backing_file,
+                                       sizeof(backing_filename));
+                               /* strncpy does not terminate on truncation */
+                               backing_filename[sizeof(backing_filename) - 1] =
+                                       '\0';
+                       } else {
+                               if (realpath(backing_file,
+                                            backing_filename) == NULL ||
+                                   stat(backing_filename, &st) != 0)
+                                       goto out;
+                       }
+                       header.backing_file_offset = cpu_to_be64(header_size);
+                       backing_filename_len = strlen(backing_filename);
+                       header.backing_file_size = cpu_to_be32(
+                               backing_filename_len);
+                       header_size += backing_filename_len;
+
+                       /*Set to the backing file size*/
+                       if (get_filesize(backing_filename, &size, &st))
+                               goto out;
+                       DPRINTF("Backing file size detected: %"PRId64" sectors"
+                               "(total %"PRId64" [%"PRId64" MB])\n",
+                               size,
+                               (uint64_t)(size << SECTOR_SHIFT),
+                               (uint64_t)(size >> 11));
+               } else {
+                       backing_file = NULL;
+                       DPRINTF("Setting file size: %"PRId64" (total "
+                               "%"PRId64")\n",
+                               total_size,
+                               (uint64_t) (total_size << SECTOR_SHIFT));
+               }
+               header.mtime = cpu_to_be32(st.st_mtime);
+               header.cluster_bits = 9; /* 512 byte cluster to avoid copying
+                                           unmodified sectors */
+               header.l2_bits = 12; /* 32 KB L2 tables */
+               exthdr.min_cluster_alloc = cpu_to_be32(1);
+       } else {
+               DPRINTF("Setting file size: %"PRId64" sectors"
+                       "(total %"PRId64" [%"PRId64" MB])\n",
+                       size,
+                       (uint64_t) (size << SECTOR_SHIFT),
+                       (uint64_t) (size >> 11));
+               header.cluster_bits = 12; /* 4 KB clusters */
+               header.l2_bits = 9; /* 4 KB L2 tables */
+               exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
+       }
+       /*Set the header size value*/
+       header.size = cpu_to_be64(size * 512);
+
+       /* round the header up to 8 bytes, then to a 4 KB boundary */
+       header_size = (header_size + 7) & ~7;
+       if (header_size % 4096 > 0) {
+               header_size = ((header_size >> 12) + 1) << 12;
+       }
+
+       shift = header.cluster_bits + header.l2_bits;
+       l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;
+
+       header.l1_table_offset = cpu_to_be64(header_size);
+       DPRINTF("L1 Table offset: %d, size %d\n",
+               header_size,
+               (int)(l1_size * sizeof(uint64_t)));
+       header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+
+       /* checksum of an all-zero L1 table */
+       ptr = calloc(1, l1_size * sizeof(uint64_t));
+       if (!ptr && l1_size > 0)
+               goto out;
+       exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
+       printf("Created cksum: %d\n",exthdr.cksum);
+       free(ptr);
+
+       /*adjust file length to system page size boundary*/
+       length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)),
+               getpagesize());
+       if (qtruncate(fd, length, 0)!=0) {
+               DPRINTF("ERROR truncating file\n");
+               goto out;
+       }
+
+       if (sparse == 0) {
+               /*Filesize is length+l1_size*(1 << s->l2_bits)+(size*512)*/
+               total_length = length + (l1_size * (1 << 9)) + (size * 512);
+               if (qtruncate(fd, total_length, 0)!=0) {
+                       DPRINTF("ERROR truncating file\n");
+                       goto out;
+               }
+               printf("File truncated to length %"PRIu64"\n",total_length);
+       } else
+               flags = SPARSE_FILE;
+
+       flags |= EXTHDR_L1_BIG_ENDIAN;
+       exthdr.flags = cpu_to_be32(flags);
+
+       /* write all the data */
+       if (lseek(fd, 0, SEEK_SET) == (off_t)-1)
+               goto out;
+       if (write(fd, &header, sizeof(header)) != (ssize_t)sizeof(header))
+               goto out;
+       if (write(fd, &exthdr, sizeof(exthdr)) != (ssize_t)sizeof(exthdr))
+               goto out;
+       if (backing_file &&
+           write(fd, backing_filename, backing_filename_len) !=
+           backing_filename_len)
+               goto out;
+
+       if (lseek(fd, header_size, SEEK_SET) == (off_t)-1)
+               goto out;
+       tmp = 0;
+       for (i = 0; i < l1_size; i++) {
+               if (write(fd, &tmp, sizeof(tmp)) != (ssize_t)sizeof(tmp))
+                       goto out;
+       }
+
+       rc = 0;
+
+out:
+       close(fd);
+       return rc;
+}
+
+/*
+ * Reset the image to an empty state: zero the L1 table in memory and on
+ * disk, truncate the file to just past the L1 table, and clear the L2
+ * cache together with its offset and usage-count arrays.
+ *
+ * Returns 0 on success and -1 if seeking, writing the full table or
+ * truncating fails.
+ */
+static int qcow_make_empty(struct tdqcow_state *s)
+{
+       uint32_t l1_length = s->l1_size * sizeof(uint64_t);
+
+       memset(s->l1_table, 0, l1_length);
+
+       /* a failed seek would put the zeroed table at the wrong offset */
+       if (lseek(s->fd, s->l1_table_offset, SEEK_SET) == (off_t)-1)
+               return -1;
+       if (write(s->fd, s->l1_table, l1_length) != (ssize_t)l1_length)
+               return -1;
+       if (qtruncate(s->fd, s->l1_table_offset + l1_length, s->sparse)!=0) {
+               DPRINTF("ERROR truncating file\n");
+               return -1;
+       }
+
+       memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+       memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
+       memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
+
+       return 0;
+}
+
+/* Accessor: returns the cluster_size field stored in the qcow state. */
+static int qcow_get_cluster_size(struct tdqcow_state *s)
+{
+       const int cluster_size = s->cluster_size;
+
+       return cluster_size;
+}
+
+/*
+ * Compress one cluster of data and write it to the image.
+ *
+ * The cluster is deflated as a raw stream (no zlib header) with a small
+ * window at the default compression level.  If the result is smaller
+ * than a cluster, space is requested from get_cluster_offset() (called
+ * with allocate = 2) and the compressed bytes are written there.  If
+ * the data does not compress, nothing is written: the uncompressed
+ * fallback write is not implemented in this driver.
+ *
+ * Returns 0 on success (including the "did not compress" case) and -1
+ * on allocation, zlib, seek or write failure.
+ *
+ * XXX: put compressed sectors first, then all the cluster aligned
+ * tables to avoid losing bytes in alignment
+ */
+static int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num,
+                          const uint8_t *buf)
+{
+       z_stream strm;
+       int ret, out_len, rc = -1;
+       uint8_t *out_buf;
+       uint64_t cluster_offset;
+
+       out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+       if (!out_buf)
+               return -1;
+
+       /* default compression level, small window, no zlib header */
+       memset(&strm, 0, sizeof(strm));
+       ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+                          Z_DEFLATED, -12,
+                          9, Z_DEFAULT_STRATEGY);
+       if (ret != 0)
+               goto out;
+
+       strm.avail_in = s->cluster_size;
+       strm.next_in = (uint8_t *)buf;
+       strm.avail_out = s->cluster_size;
+       strm.next_out = out_buf;
+
+       ret = deflate(&strm, Z_FINISH);
+       out_len = strm.next_out - out_buf;
+       deflateEnd(&strm);
+
+       if (ret != Z_STREAM_END && ret != Z_OK)
+               goto out;
+
+       if (ret == Z_STREAM_END && out_len < s->cluster_size) {
+               cluster_offset = get_cluster_offset(s, sector_num << 9, 2,
+                                                   out_len, 0, 0);
+               /*
+                * The write path in this file treats a zero offset from
+                * get_cluster_offset() as failure; writing at offset 0
+                * would overwrite the image header, so bail out.
+                */
+               if (!cluster_offset)
+                       goto out;
+               cluster_offset &= s->cluster_offset_mask;
+
+               if (lseek(s->fd, cluster_offset, SEEK_SET) == (off_t)-1)
+                       goto out;
+               if (write(s->fd, out_buf, out_len) != out_len)
+                       goto out;
+       }
+       /* else: could not compress; nothing is written for this cluster */
+
+       rc = 0;
+
+out:
+       free(out_buf);
+       return rc;
+}
+
+/*
+ * Determine the disk type of `file` by inspecting its first bytes.
+ *
+ * Reads a QCowHeader from the start of the file; *type is set to
+ * DISK_TYPE_QCOW when the big-endian magic matches QCOW_MAGIC and to
+ * DISK_TYPE_AIO otherwise.
+ *
+ * Returns 0 on success, -errno if the file cannot be opened or read,
+ * and -EIO if it is shorter than a header.
+ */
+static int
+tdqcow_get_image_type(const char *file, int *type)
+{
+       int fd, err;
+       ssize_t got;
+       QCowHeader header;
+
+       fd = open(file, O_RDONLY);
+       if (fd == -1)
+               return -errno;
+
+       got = read(fd, &header, sizeof(header));
+       /* capture errno before close() has a chance to change it */
+       err = (got < 0) ? -errno : 0;
+       close(fd);
+
+       if (err)
+               return err;
+       if (got != (ssize_t)sizeof(header))
+               return -EIO;
+
+       be32_to_cpus(&header.magic);
+       if (header.magic == QCOW_MAGIC)
+               *type = DISK_TYPE_QCOW;
+       else
+               *type = DISK_TYPE_AIO;
+
+       return 0;
+}
+
+/*
+ * Look up the parent (backing) image of a qcow image.
+ *
+ * The backing file name is read from the image at backing_file_offset.
+ * Reads are done in whole 512-byte sectors into a 512-byte aligned
+ * buffer, starting at the sector that contains the name.  On success
+ * id->name receives a newly allocated copy of the name (the caller owns
+ * it) and id->drivertype the type found by tdqcow_get_image_type().
+ *
+ * Returns TD_NO_PARENT when the image has no backing file, 0 on success,
+ * -1 if the read buffer cannot be allocated, -ENOMEM if the name cannot
+ * be duplicated, and -EINVAL for any other failure, including a name
+ * length that is not positive or not below PATH_MAX.
+ */
+int tdqcow_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       off_t off;
+       char *buf, *filename;
+       int len, secs, type, err = -EINVAL;
+       struct tdqcow_state *child  = (struct tdqcow_state *)driver->data;
+
+       if (!child->backing_file_offset)
+               return TD_NO_PARENT;
+
+       /* the length comes from the image header: do not trust it blindly */
+       len  = child->backing_file_size;
+       if (len <= 0 || len >= PATH_MAX)
+               return -EINVAL;
+
+       /* read the backing file name */
+       off  = child->backing_file_offset - (child->backing_file_offset % 512);
+       secs = (len + (child->backing_file_offset - off) + 511) >> 9;
+
+       /*
+        * One extra sector of room: when the name ends exactly on a sector
+        * boundary the terminating NUL lands one byte past the data read.
+        */
+       if (posix_memalign((void **)&buf, 512, (secs + 1) << 9))
+               return -1;
+
+       if (lseek(child->fd, off, SEEK_SET) == (off_t)-1)
+               goto out;
+
+       if (read(child->fd, buf, secs << 9) != secs << 9)
+               goto out;
+       filename       = buf + (child->backing_file_offset - off);
+       filename[len]  = '\0';
+
+       if (tdqcow_get_image_type(filename, &type))
+               goto out;
+
+       id->name       = strdup(filename);
+       if (!id->name) {
+               err = -ENOMEM;
+               goto out;
+       }
+       id->drivertype = type;
+       err            = 0;
+ out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Check that a parent image is acceptable for this child: both files
+ * must be stat()-able and get_filesize() must report the same size for
+ * each.  Returns 0 when they match and -EINVAL otherwise.  The flags
+ * argument is not used.
+ */
+int tdqcow_validate_parent(td_driver_t *driver,
+                         td_driver_t *pdriver, td_flag_t flags)
+{
+       struct tdqcow_state *child  = (struct tdqcow_state *)driver->data;
+       struct tdqcow_state *parent = (struct tdqcow_state *)pdriver->data;
+       uint64_t parent_size, child_size;
+       struct stat sb;
+
+       /* parent first, then child; || keeps the original call order */
+       if (stat(parent->name, &sb) ||
+           get_filesize(parent->name, &parent_size, &sb))
+               return -EINVAL;
+
+       if (stat(child->name, &sb) ||
+           get_filesize(child->name, &child_size, &sb))
+               return -EINVAL;
+
+       return (child_size == parent_size) ? 0 : -EINVAL;
+}
+
+/* Driver table wiring the qcow handlers defined in this file into tap_disk. */
+struct tap_disk tapdisk_qcow = {
+       /* identification and per-instance state size */
+       .disk_type          = "tapdisk_qcow",
+       .flags              = 0,
+       .private_data_size  = sizeof(struct tdqcow_state),
+
+       /* open / close */
+       .td_open            = tdqcow_open,
+       .td_close           = tdqcow_close,
+
+       /* request queueing */
+       .td_queue_read      = tdqcow_queue_read,
+       .td_queue_write     = tdqcow_queue_write,
+
+       /* parent (backing image) handling */
+       .td_get_parent_id   = tdqcow_get_parent_id,
+       .td_validate_parent = tdqcow_validate_parent,
+
+       /* no debug hook */
+       .td_debug           = NULL,
+};
diff -r f210a633571c -r 1c627434605e tools/blktap2/drivers/block-ram.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/drivers/block-ram.c Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,269 @@
+/* 
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <string.h>
+
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/*
+ * File-scope state shared by every open of the RAM disk.
+ *
+ * img holds the image contents in memory; it is allocated and filled by
+ * tdram_open() and accessed by the read/write handlers.
+ */
+char *img;
+/* Geometry cached by get_image_info() and handed to later opens. */
+long int   disksector_size;
+long int   disksize;
+long int   diskinfo;
+/* Open count: incremented in tdram_open(), decremented in tdram_close(). */
+static int connections = 0;
+
+/* Per-driver private data for the RAM disk. */
+struct tdram_state {
+        /* descriptor of the image file, or -1 for an open that reuses
+         * the already loaded image */
+        int fd;
+};
+
+/*
+ * Fill in *info (size in sectors, sector size, info word) for the image
+ * open on fd, and cache the same values in the file-scope variables
+ * disksector_size, disksize and diskinfo.
+ *
+ * Block devices are queried with BLKGETSIZE (and BLKSSZGET when it is
+ * available); regular files use st_size shifted by SECTOR_SHIFT.  A size
+ * of zero is replaced by MAX_RAMDISK_SIZE.
+ *
+ * Returns 0 on success and -EINVAL if fstat() or BLKGETSIZE fails.
+ */
+static int get_image_info(int fd, td_disk_info_t *info)
+{
+       struct stat st;
+
+       if (fstat(fd, &st) != 0) {
+               DPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (S_ISBLK(st.st_mode)) {
+               /*
+                * BLKGETSIZE stores an unsigned long; use a local of that
+                * exact type rather than pointing the ioctl at the field.
+                */
+               unsigned long nr_sectors = 0;
+
+               /*Accessing block device directly*/
+               info->size = 0;
+               if (ioctl(fd, BLKGETSIZE, &nr_sectors) != 0) {
+                       DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+                       return -EINVAL;
+               }
+               info->size = nr_sectors;
+
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+
+               /*Get the sector size*/
+#if defined(BLKSSZGET)
+               {
+                       /* BLKSSZGET stores an int */
+                       int ssz = 0;
+
+                       info->sector_size = DEFAULT_SECTOR_SIZE;
+                       if (ioctl(fd, BLKSSZGET, &ssz) == 0)
+                               info->sector_size = ssz;
+
+                       if (info->sector_size != DEFAULT_SECTOR_SIZE)
+                               DPRINTF("Note: sector size is %ld (not %d)\n",
+                                       info->sector_size, DEFAULT_SECTOR_SIZE);
+               }
+#else
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+       } else {
+               /*Local file? try fstat instead*/
+               info->size = (st.st_size >> SECTOR_SHIFT);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+       }
+
+       if (info->size == 0) {
+               info->size =((uint64_t) MAX_RAMDISK_SIZE);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+       }
+       info->info = 0;
+
+       /*Store variables locally*/
+       disksector_size = info->sector_size;
+       disksize        = info->size;
+       diskinfo        = info->info;
+       DPRINTF("Image sector_size: \n\t[%lu]\n",
+               info->sector_size);
+
+       return 0;
+}
+
+/*
+ * Open the RAM disk.
+ *
+ * The first open loads the whole image file into the file-scope buffer
+ * img, one sector per read().  Later opens (connections > 1) only copy
+ * the cached geometry into driver->info and set prv->fd to -1.
+ *
+ * The file is opened with O_DIRECT, falling back to a plain open when
+ * that is rejected with EINVAL; it is opened read-only when flags is
+ * TD_OPEN_RDONLY and read-write otherwise.
+ *
+ * Returns 0 on success or a negative value on failure.  On failure the
+ * connection count is rolled back, the descriptor is closed and the
+ * image buffer is released, so a later open starts from scratch.
+ */
+int tdram_open (td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       char *p;
+       uint64_t size;
+       int i, fd, ret = 0, count = 0, o_flags;
+       struct tdram_state *prv = (struct tdram_state *)driver->data;
+
+       connections++;
+
+       if (connections > 1) {
+               driver->info.sector_size = disksector_size;
+               driver->info.size        = disksize;
+               driver->info.info        = diskinfo;
+               DPRINTF("Image already open, returning parameters:\n");
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(driver->info.size << SECTOR_SHIFT),
+                       (long long unsigned)driver->info.size);
+               DPRINTF("Image sector_size: \n\t[%lu]\n",
+                       driver->info.sector_size);
+
+               prv->fd = -1;
+               return 0;
+       }
+
+       /* Open the file */
+       o_flags = O_DIRECT | O_LARGEFILE |
+               ((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+       fd = open(name, o_flags);
+
+       if ((fd == -1) && (errno == EINVAL)) {
+
+               /* Maybe O_DIRECT isn't supported. */
+               o_flags &= ~O_DIRECT;
+               fd = open(name, o_flags);
+               if (fd != -1) DPRINTF("WARNING: Accessing image without"
+                                     "O_DIRECT! (%s)\n", name);
+
+       } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name);
+
+       if (fd == -1) {
+               /* save errno before logging can disturb it */
+               ret = 0 - errno;
+               DPRINTF("Unable to open [%s]!\n",name);
+               goto fail;
+       }
+
+       prv->fd = fd;
+
+       ret = get_image_info(fd, &driver->info);
+       if (ret)
+               goto fail;
+       size = MAX_RAMDISK_SIZE;
+
+       if (driver->info.size > size) {
+               DPRINTF("Disk exceeds limit, must be less than [%d]MB",
+                       (MAX_RAMDISK_SIZE<<SECTOR_SHIFT)>>20);
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       /* drop any buffer left over from an earlier session */
+       free(img);
+       img = NULL;
+
+       /*Read the image into memory*/
+       ret = posix_memalign((void **)&img,
+                            DEFAULT_SECTOR_SIZE,
+                            driver->info.size << SECTOR_SHIFT);
+       if (ret) {
+               /* posix_memalign returns the error; it does not set errno */
+               DPRINTF("Mem malloc failed\n");
+               img = NULL;
+               ret = -ret;
+               goto fail;
+       }
+       p = img;
+       DPRINTF("Reading %llu bytes.......",
+               (long long unsigned)driver->info.size << SECTOR_SHIFT);
+
+       for (i = 0; i < driver->info.size; i++) {
+               ret = read(prv->fd, p, driver->info.sector_size);
+               if (ret != driver->info.sector_size) {
+                       DPRINTF("ret = %d, errno = %d\n", ret, errno);
+                       break;
+               } else {
+                       count += ret;
+                       p = img + count;
+               }
+       }
+       DPRINTF("[%d]\n",count);
+       if (count != driver->info.size << SECTOR_SHIFT) {
+               ret = -1;
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       /* undo everything so the next open does not see a half-loaded image */
+       connections--;
+       free(img);
+       img = NULL;
+       if (fd != -1)
+               close(fd);
+       prv->fd = -1;
+       return ret;
+}
+
+/*
+ * Serve a read straight from the in-memory image: copy treq.secs sectors
+ * starting at sector treq.sec into treq.buf and complete the request
+ * with status 0.
+ */
+void tdram_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       uint64_t start  = treq.sec * (uint64_t)driver->info.sector_size;
+       int      nbytes = treq.secs * driver->info.sector_size;
+       const char *src = img + start;
+
+       memcpy(treq.buf, src, nbytes);
+       td_complete_request(treq, 0);
+}
+
+/*
+ * Serve a write straight into the in-memory image: copy treq.secs
+ * sectors from treq.buf to sector treq.sec and complete the request
+ * with status 0.
+ *
+ * Write access for multiple disks is assumed to be controlled at a
+ * higher level; nothing is checked here.
+ */
+void tdram_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       uint64_t start  = treq.sec * (uint64_t)driver->info.sector_size;
+       int      nbytes = treq.secs * driver->info.sector_size;
+       char    *dst    = img + start;
+
+       memcpy(dst, treq.buf, nbytes);
+       td_complete_request(treq, 0);
+}
+
+/*
+ * Close one connection to the RAM disk.  When the last connection goes
+ * away the in-memory image is released, so the next open reloads it
+ * from the file.  Always returns 0.
+ *
+ * NOTE(review): the descriptor stored in prv->fd by the first open is
+ * not closed here, because this function cannot tell whether that open
+ * succeeded far enough to set it; confirm and close it where safe.
+ */
+int tdram_close(td_driver_t *driver)
+{
+       (void)driver;
+
+       /* never let the counter go negative on an unbalanced close */
+       if (connections > 0)
+               connections--;
+
+       if (connections == 0) {
+               free(img);
+               img = NULL;
+       }
+
+       return 0;
+}
+
+/* The RAM disk never has a parent: always reports TD_NO_PARENT. */
+int tdram_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       (void)driver;
+       (void)id;
+
+       return TD_NO_PARENT;
+}
+
+/* No parent is ever valid for a RAM disk: always returns -EINVAL. */
+int tdram_validate_parent(td_driver_t *driver,
+                         td_driver_t *pdriver, td_flag_t flags)
+{
+       (void)driver;
+       (void)pdriver;
+       (void)flags;
+
+       return -EINVAL;
+}
+
+/* Driver table wiring the RAM-disk handlers defined in this file into tap_disk. */
+struct tap_disk tapdisk_ram = {
+       /* identification and per-instance state size */
+       .disk_type          = "tapdisk_ram",
+       .flags              = 0,
+       .private_data_size  = sizeof(struct tdram_state),
+
+       /* open / close */
+       .td_open            = tdram_open,
+       .td_close           = tdram_close,
+
+       /* request queueing */
+       .td_queue_read      = tdram_queue_read,
+       .td_queue_write     = tdram_queue_write,
+
+       /* parent handling */
+       .td_get_parent_id   = tdram_get_parent_id,
+       .td_validate_parent = tdram_validate_parent,
+
+       /* no debug hook */
+       .td_debug           = NULL,
+};
diff -r f210a633571c -r 1c627434605e tools/blktap2/drivers/block-vhd.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap2/drivers/block-vhd.c Tue May 26 11:52:31 2009 +0100
@@ -0,0 +1,2321 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * A note on write transactions:
+ * Writes that require updating the BAT or bitmaps cannot be signaled
+ * as complete until all updates have reached disk.  Transactions are
+ * used to ensure proper ordering in these cases.  The two types of
+ * transactions are as follows:
+ *   - Bitmap updates only: data writes that require updates to the same
+ *     bitmap are grouped in a transaction.  Only after all data writes
+ *     in a transaction complete does the bitmap write commence.  Only
+ *     after the bitmap write finishes are the data writes signalled as
+ *     complete.
+ *   - BAT and bitmap updates: data writes are grouped in transactions
+ *     as above, but a special extra write is included in the transaction,
+ *     which zeros out the newly allocated bitmap on disk.  When the data
+ *     writes and the zero-bitmap write complete, the BAT and bitmap writes
+ *     are started in parallel.  The transaction is completed only after both
+ *     the BAT and bitmap writes successfully return.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <uuid/uuid.h> /* For whatever reason, Linux packages this in */
+                       /* e2fsprogs-devel.                            */
+#include <string.h>    /* for memset.                                 */
+#include <libaio.h>
+#include <sys/mman.h>
+
+#include "libvhd.h"
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/* Global copy of the sectors-per-block value of the most recently opened
+ * image (assigned in __vhd_open); used when logging block numbers. */
+unsigned int SPB;
+
+/* DEBUGGING selects the DBG/ERR/TRACE back end below:
+ * 1 = DPRINTF, 2 = tlog, anything else = compiled out. */
+#define DEBUGGING   2
+/* ASSERTING == 1 enables ASSERT(); otherwise it expands to a no-op. */
+#define ASSERTING   1
+/* NOTE(review): no use of MICROSOFT_COMPAT is visible in this part of
+ * the file -- confirm what it controls. */
+#define MICROSOFT_COMPAT
+
+/* Upper bound on attempts to read the batmap in vhd_initialize_bat(). */
+#define VHD_BATMAP_MAX_RETRIES 10
+
+/*
+ * Log a one-line snapshot of the request counters of vhd_state 's':
+ * queued/completed/returned totals, the number of data request slots in
+ * use (VHD_REQS_DATA minus the free count) and the block number of the
+ * pending BAT write.
+ */
+#define __TRACE(s)                                                     \
+       do {                                                            \
+               DBG(TLOG_DBG, "%s: QUEUED: %" PRIu64 ", COMPLETED: %"   \
+                   PRIu64", RETURNED: %" PRIu64 ", DATA_ALLOCATED: "   \
+                   "%lu, BBLK: 0x%04x\n",                              \
+                   s->vhd.file, s->queued, s->completed, s->returned,  \
+                   VHD_REQS_DATA - s->vreq_free_count,                 \
+                   s->bat.pbw_blk);                                    \
+       } while(0)
+
+#define __ASSERT(_p)                                                   \
+       if (!(_p)) {                                                    \
+               DPRINTF("%s:%d: FAILED ASSERTION: '%s'\n",              \
+                       __FILE__, __LINE__, #_p);                       \
+               DBG(TLOG_WARN, "%s:%d: FAILED ASSERTION: '%s'\n",       \
+                   __FILE__, __LINE__, #_p);                           \
+               tlog_flush();                                           \
+               *(int*)0 = 0;                                           \
+       }
+
+/*
+ * Logging front ends, selected by DEBUGGING:
+ *   1: DBG/ERR go through DPRINTF (the level argument is ignored),
+ *      TRACE is a no-op;
+ *   2: DBG/ERR go through tlog_write/tlog_error, TRACE expands to
+ *      __TRACE;
+ *   otherwise: all three compile to nothing.
+ */
+#if (DEBUGGING == 1)
+  #define DBG(level, _f, _a...)      DPRINTF(_f, ##_a)
+  #define ERR(err, _f, _a...)        DPRINTF("ERROR: %d: " _f, err, ##_a)
+  #define TRACE(s)                   ((void)0)
+#elif (DEBUGGING == 2)
+  #define DBG(level, _f, _a...)      tlog_write(level, _f, ##_a)
+  #define ERR(err, _f, _a...)       tlog_error(err, _f, ##_a)
+  #define TRACE(s)                   __TRACE(s)
+#else
+  #define DBG(level, _f, _a...)      ((void)0)
+  #define ERR(err, _f, _a...)        ((void)0)
+  #define TRACE(s)                   ((void)0)
+#endif
+
+/* ASSERT() is active only when ASSERTING == 1. */
+#if (ASSERTING == 1)
+  #define ASSERT(_p)                 __ASSERT(_p)
+#else
+  #define ASSERT(_p)                 ((void)0)
+#endif
+
+/******VHD DEFINES******/
+/* number of block bitmaps kept in the per-image cache */
+#define VHD_CACHE_SIZE               32
+
+/* request pool sizing: data requests, plus one metadata request per
+ * cached bitmap and two extra */
+#define VHD_REQS_DATA                TAPDISK_DATA_REQUESTS
+#define VHD_REQS_META                (VHD_CACHE_SIZE + 2)
+#define VHD_REQS_TOTAL               (VHD_REQS_DATA + VHD_REQS_META)
+
+/* operation codes; presumably the values of vhd_request.op -- confirm */
+#define VHD_OP_BAT_WRITE             0
+#define VHD_OP_DATA_READ             1
+#define VHD_OP_DATA_WRITE            2
+#define VHD_OP_BITMAP_READ           3
+#define VHD_OP_BITMAP_WRITE          4
+#define VHD_OP_ZERO_BM_WRITE         5
+
+/* NOTE(review): users of the VHD_BM_* codes are not visible in this
+ * part of the file; presumably results of a bitmap/BAT lookup */
+#define VHD_BM_BAT_LOCKED            0
+#define VHD_BM_BAT_CLEAR             1
+#define VHD_BM_BIT_CLEAR             2
+#define VHD_BM_BIT_SET               3
+#define VHD_BM_NOT_CACHED            4
+#define VHD_BM_READ_PENDING          5
+
+/* open-time flags, stored in vhd_state.flags by __vhd_open() */
+#define VHD_FLAG_OPEN_RDONLY         1
+#define VHD_FLAG_OPEN_NO_CACHE       2
+#define VHD_FLAG_OPEN_QUIET          4
+#define VHD_FLAG_OPEN_STRICT         8
+#define VHD_FLAG_OPEN_QUERY          16
+#define VHD_FLAG_OPEN_PREALLOCATE    32
+
+/* bits of vhd_bat_state.status */
+#define VHD_FLAG_BAT_LOCKED          1
+#define VHD_FLAG_BAT_WRITE_STARTED   2
+
+/* bits of vhd_bitmap.status */
+#define VHD_FLAG_BM_UPDATE_BAT       1
+#define VHD_FLAG_BM_WRITE_PENDING    2
+#define VHD_FLAG_BM_READ_PENDING     4
+#define VHD_FLAG_BM_LOCKED           8
+
+/* presumably bits of vhd_request.flags -- confirm */
+#define VHD_FLAG_REQ_UPDATE_BAT      1
+#define VHD_FLAG_REQ_UPDATE_BITMAP   2
+#define VHD_FLAG_REQ_QUEUED          4
+#define VHD_FLAG_REQ_FINISHED        8
+
+/* bits of vhd_transaction.status */
+#define VHD_FLAG_TX_LIVE             1
+#define VHD_FLAG_TX_UPDATE_BAT       2
+
+/* storage type for all of the flag words above */
+typedef uint8_t vhd_flag_t;
+
+struct vhd_state;
+struct vhd_request;
+
+/*
+ * Singly linked list of requests, chained through vhd_request.next.
+ * An empty list has head == tail == NULL (see clear_req_list).
+ */
+struct vhd_req_list {
+       struct vhd_request       *head;
+       struct vhd_request       *tail;
+};
+
+/*
+ * Bookkeeping for a group of requests that must complete together.
+ * 'started' is bumped for every request added (add_to_transaction);
+ * the transaction counts as completed once started == finished
+ * (transaction_completed).  No request may be added once 'closed' is
+ * set (asserted in add_to_transaction).
+ */
+struct vhd_transaction {
+       int                       error;
+       int                       closed;
+       int                       started;
+       int                       finished;
+       vhd_flag_t                status;      /* VHD_FLAG_TX_* bits */
+       struct vhd_req_list       requests;    /* member requests */
+};
+
+/*
+ * One in-flight VHD request: wraps the tapdisk request it serves and
+ * the tiocb used to submit it, and links back to the owning image state.
+ */
+struct vhd_request {
+       int                       error;
+       uint8_t                   op;          /* presumably a VHD_OP_* code */
+       vhd_flag_t                flags;       /* presumably VHD_FLAG_REQ_* bits */
+       td_request_t              treq;
+       struct tiocb              tiocb;
+       struct vhd_state         *state;       /* set by init_vhd_request */
+       struct vhd_request       *next;        /* vhd_req_list linkage */
+       struct vhd_transaction   *tx;          /* set by add_to_transaction */
+};
+
+/*
+ * In-memory BAT and batmap for an image, together with the state of a
+ * pending BAT write and the preallocated requests used for it.
+ */
+struct vhd_bat_state {
+       vhd_bat_t                 bat;
+       vhd_batmap_t              batmap;
+       vhd_flag_t                status;      /* VHD_FLAG_BAT_* bits */
+       uint32_t                  pbw_blk;     /* blk num of pending write */
+       uint64_t                  pbw_offset;  /* file offset of same */
+       struct vhd_request        req;         /* for writing bat table */
+       struct vhd_request        zero_req;    /* for initializing bitmaps */
+       char                     *bat_buf;     /* sector-aligned, one sector
+                                               * (vhd_initialize_bat) */
+};
+
+/*
+ * One cached block bitmap: the on-disk copy ('map'), the working copy
+ * ('shadow'), and the request lists and transaction attached to it.
+ */
+struct vhd_bitmap {
+       u32                       blk;         /* block this bitmap covers */
+       u64                       seqno;       /* lru sequence number */
+       vhd_flag_t                status;      /* VHD_FLAG_BM_* bits */
+
+       char                     *map;         /* map should only be modified
+                                               * in finish_bitmap_write */
+       char                     *shadow;      /* in-memory bitmap changes are 
+                                               * made to shadow and copied to
+                                               * map only after having been
+                                               * flushed to disk */
+       struct vhd_transaction    tx;          /* transaction data structure
+                                               * encapsulating data, bitmap, 
+                                               * and bat writes */
+       struct vhd_req_list       queue;       /* data writes waiting for next
+                                               * transaction */
+       struct vhd_req_list       waiting;     /* pending requests that cannot
+                                               * be serviced until this bitmap
+                                               * is read from disk */
+       struct vhd_request        req;
+};
+
+/*
+ * Per-image driver state (the td_driver_t private data): the libvhd
+ * context, BAT state, bitmap cache, request pool and I/O statistics.
+ */
+struct vhd_state {
+       vhd_flag_t                flags;       /* VHD_FLAG_OPEN_* bits */
+
+        /* VHD stuff */
+       vhd_context_t             vhd;
+       u32                       spp;         /* sectors per page */
+        u32                       spb;         /* sectors per block */
+        u64                       next_db;     /* pointer to the next 
+                                               * (unallocated) datablock */
+
+       struct vhd_bat_state      bat;
+
+       u64                       bm_lru;      /* lru sequence number */
+       u32                       bm_secs;     /* size of bitmap, in sectors */
+       struct vhd_bitmap        *bitmap[VHD_CACHE_SIZE];
+
+       /* stack of unused cache entries; bitmap_list is the backing store */
+       int                       bm_free_count;
+       struct vhd_bitmap        *bitmap_free[VHD_CACHE_SIZE];
+       struct vhd_bitmap         bitmap_list[VHD_CACHE_SIZE];
+
+       /* free list of data requests; vreq_list is the backing store */
+       int                       vreq_free_count;
+       struct vhd_request       *vreq_free[VHD_REQS_DATA];
+       struct vhd_request        vreq_list[VHD_REQS_DATA];
+
+       td_driver_t              *driver;
+
+       /* statistics counters */
+       uint64_t                  queued;
+       uint64_t                  completed;
+       uint64_t                  returned;
+       uint64_t                  reads;
+       uint64_t                  read_size;
+       uint64_t                  writes;
+       uint64_t                  write_size;
+};
+
+/* Flag-word helpers.  test_vhd_flag yields the masked bits (non-zero when
+ * set, not normalised to 1); set/clear modify 'word' in place. */
+#define test_vhd_flag(word, flag)  ((word) & (flag))
+#define set_vhd_flag(word, flag)   ((word) |= (flag))
+#define clear_vhd_flag(word, flag) ((word) &= ~(flag))
+
+/* BAT entry for block 'blk' of state 's' (no bounds check) */
+#define bat_entry(s, blk)          ((s)->bat.bat.bat[(blk)])
+
+/* defined later in the file */
+static void vhd_complete(void *, struct tiocb *, int);
+static void finish_data_transaction(struct vhd_state *, struct vhd_bitmap *);
+
+/*
+ * Shared read-only mapping created by vhd_initialize(): _vhd_zeros is the
+ * mapping, _vhd_zsize its length in bytes, and _vhd_master the state that
+ * created it (only that state may unmap it, see vhd_free()).
+ */
+static struct vhd_state  *_vhd_master;
+static unsigned long      _vhd_zsize;
+static char              *_vhd_zeros;
+
+/*
+ * Create the shared zero mapping if it does not exist yet.
+ *
+ * The mapping is two pages long, plus VHD_BLOCK_SIZE when the state was
+ * opened with VHD_FLAG_OPEN_PREALLOCATE.  The state that creates the
+ * mapping is recorded in _vhd_master.
+ *
+ * Returns 0 on success (including when the mapping already exists) or a
+ * negative errno value if mmap() fails.
+ *
+ * NOTE(review): an existing mapping is reused as-is, whatever size the
+ * first opener asked for -- confirm this is adequate for later openers.
+ */
+static int
+vhd_initialize(struct vhd_state *s)
+{
+       int err;
+
+       if (_vhd_zeros)
+               return 0;
+
+       _vhd_zsize = 2 * getpagesize();
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
+               _vhd_zsize += VHD_BLOCK_SIZE;
+
+       _vhd_zeros = mmap(0, _vhd_zsize, PROT_READ,
+                         MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+       if (_vhd_zeros == MAP_FAILED) {
+               /* capture errno before logging, which may overwrite it */
+               err = -errno;
+               EPRINTF("vhd_initialize failed: %d\n", err);
+               _vhd_zeros = NULL;
+               _vhd_zsize = 0;
+               return err;
+       }
+
+       _vhd_master = s;
+       return 0;
+}
+
+/*
+ * Tear down the shared zero mapping, but only when called for the state
+ * that created it and only if the mapping exists; otherwise do nothing.
+ */
+static void
+vhd_free(struct vhd_state *s)
+{
+       if (_vhd_zeros && _vhd_master == s) {
+               munmap(_vhd_zeros, _vhd_zsize);
+               _vhd_master = NULL;
+               _vhd_zeros  = NULL;
+               _vhd_zsize  = 0;
+       }
+}
+
+/*
+ * Return the shared zero mapping for a caller that needs 'size' bytes.
+ * If the mapping is missing or shorter than 'size', the request is
+ * logged (with the caller's name in 'func') and ASSERT(0) fires.
+ */
+static char *
+_get_vhd_zeros(const char *func, unsigned long size)
+{
+       int usable = (_vhd_zeros != NULL && size <= _vhd_zsize);
+
+       if (!usable) {
+               EPRINTF("invalid zero request from %s: %lu, %lu, %p\n",
+                       func, size, _vhd_zsize, _vhd_zeros);
+               ASSERT(0);
+       }
+
+       return _vhd_zeros;
+}
+
+/* convenience wrapper that passes the calling function's name */
+#define vhd_zeros(size)        _get_vhd_zeros(__func__, size)
+
+/*
+ * Mark block 'blk' in the batmap and log it as completely full.
+ * Does nothing when no batmap is loaded.
+ */
+static inline void
+set_batmap(struct vhd_state *s, uint32_t blk)
+{
+       if (!s->bat.batmap.map)
+               return;
+
+       vhd_batmap_set(&s->vhd, &s->bat.batmap, blk);
+       DBG(TLOG_DBG, "block 0x%x completely full\n", blk);
+}
+
+/*
+ * Query the batmap bit for block 'blk'.  Returns 0 when no batmap is
+ * loaded, otherwise whatever vhd_batmap_test() reports.
+ */
+static inline int
+test_batmap(struct vhd_state *s, uint32_t blk)
+{
+       if (s->bat.batmap.map)
+               return vhd_batmap_test(&s->vhd, &s->bat.batmap, blk);
+
+       return 0;
+}
+
+/*
+ * Overwrite the last 512 bytes of the image file (where the footer
+ * lives) with the filler byte 0xc7.  Fixed-type images are left alone.
+ *
+ * Returns 0 on success or a negative errno value.  A short write that
+ * does not set errno is reported as -EIO.
+ */
+static int
+vhd_kill_footer(struct vhd_state *s)
+{
+       int err;
+       off64_t end;
+       char *zeros;
+
+       if (s->vhd.footer.type == HD_TYPE_FIXED)
+               return 0;
+
+       err = posix_memalign((void **)&zeros, 512, 512);
+       if (err)
+               return -err;
+
+       /* memset() only uses the low byte of its fill argument */
+       memset(zeros, 0xc7, 512);
+
+       end = lseek64(s->vhd.fd, 0, SEEK_END);
+       if (end == (off64_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       if (lseek64(s->vhd.fd, (end - 512), SEEK_SET) == (off64_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       /* clear errno so a short write cannot report a stale value */
+       errno = 0;
+       if (write(s->vhd.fd, zeros, 512) != 512) {
+               err = (errno ? -errno : -EIO);
+               goto out;
+       }
+
+       err = 0;
+
+ out:
+       /* err was captured above, so free() cannot disturb the result */
+       free(zeros);
+       return err;
+}
+
+/*
+ * Compute s->next_db, the sector at which the next data block would be
+ * placed: start from the end of the headers (rounded up to a sector)
+ * and move past every allocated block found in the BAT.
+ *
+ * Returns 0 on success or the error from vhd_end_of_headers().
+ */
+static inline int
+find_next_free_block(struct vhd_state *s)
+{
+       int err;
+       off64_t eom;
+       uint32_t i, entry;
+
+       err = vhd_end_of_headers(&s->vhd, &eom);
+       if (err)
+               return err;
+
+       s->next_db = secs_round_up(eom);
+
+       for (i = 0; i < s->bat.bat.entries; i++) {
+               entry = bat_entry(s, i);
+               /* widen before adding: all three operands are 32-bit and
+                * the sum must not wrap before it reaches next_db */
+               if (entry != DD_BLK_UNUSED && entry >= s->next_db)
+                       s->next_db = (uint64_t)entry + s->spb + s->bm_secs;
+       }
+
+       return 0;
+}
+
+/*
+ * Release the BAT table, the batmap and the BAT sector buffer, then
+ * zero the whole BAT state.
+ *
+ * The memset covers all of s->bat (a struct vhd_bat_state) so that the
+ * batmap and bat_buf pointers are cleared as well; this makes a second
+ * call -- as happens on the __vhd_open() failure path -- harmless.
+ */
+static void
+vhd_free_bat(struct vhd_state *s)
+{
+       free(s->bat.bat.bat);
+       free(s->bat.batmap.map);
+       free(s->bat.bat_buf);
+       memset(&s->bat, 0, sizeof(s->bat));
+}
+
+/*
+ * Load the BAT (and the batmap, when the image has one) and allocate the
+ * sector-sized buffer used for BAT writes.
+ *
+ * For writable images the next free data block is located as well, and
+ * a batmap read failure is fatal.  For read-only images the batmap read
+ * is retried up to VHD_BATMAP_MAX_RETRIES times and a persistent failure
+ * is only logged.
+ *
+ * Returns 0 on success.  On failure everything allocated here is
+ * released; an allocation failure is returned as a negative errno.
+ */
+static int
+vhd_initialize_bat(struct vhd_state *s)
+{
+       int err, batmap_required, i;
+
+       /* s->bat is a struct vhd_bat_state: clear all of it */
+       memset(&s->bat, 0, sizeof(s->bat));
+
+       err = vhd_read_bat(&s->vhd, &s->bat.bat);
+       if (err) {
+               EPRINTF("%s: reading bat: %d\n", s->vhd.file, err);
+               return err;
+       }
+
+       batmap_required = 1;
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) {
+               batmap_required = 0;
+       } else {
+               err = find_next_free_block(s);
+               if (err)
+                       goto fail;
+       }
+
+       if (vhd_has_batmap(&s->vhd)) {
+               for (i = 0; i < VHD_BATMAP_MAX_RETRIES; i++) {
+                       err = vhd_read_batmap(&s->vhd, &s->bat.batmap);
+                       if (err) {
+                               EPRINTF("%s: reading batmap: %d\n",
+                                               s->vhd.file, err);
+                               if (batmap_required)
+                                       goto fail;
+                       } else {
+                               break;
+                       }
+               }
+               if (err)
+                       EPRINTF("%s: ignoring non-critical batmap error\n",
+                                       s->vhd.file);
+       }
+
+       err = posix_memalign((void **)&s->bat.bat_buf,
+                            VHD_SECTOR_SIZE, VHD_SECTOR_SIZE);
+       if (err) {
+               s->bat.bat_buf = NULL;
+               /* posix_memalign() returns a positive errno value */
+               err = -err;
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       vhd_free_bat(s);
+       return err;
+}
+
+/*
+ * Free both buffers of every cache entry, empty the free-entry table
+ * and zero the entry array so that a repeated call is harmless.
+ */
+static void
+vhd_free_bitmap_cache(struct vhd_state *s)
+{
+       int slot;
+
+       for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+               free(s->bitmap_list[slot].map);
+               free(s->bitmap_list[slot].shadow);
+               s->bitmap_free[slot] = NULL;
+       }
+
+       memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);
+}
+
+/*
+ * Allocate the bitmap cache: for each of the VHD_CACHE_SIZE entries, a
+ * 512-byte aligned 'map' and 'shadow' buffer of bm_secs sectors, both
+ * zeroed, and push the entry on the free table.
+ *
+ * Returns 0 on success.  On allocation failure the whole cache is freed
+ * and a negative errno value is returned.
+ */
+static int
+vhd_initialize_bitmap_cache(struct vhd_state *s)
+{
+       int i, err, map_size;
+       struct vhd_bitmap *bm;
+
+       memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);
+
+       s->bm_lru        = 0;
+       map_size         = vhd_sectors_to_bytes(s->bm_secs);
+       s->bm_free_count = VHD_CACHE_SIZE;
+
+       for (i = 0; i < VHD_CACHE_SIZE; i++) {
+               bm = s->bitmap_list + i;
+
+               err = posix_memalign((void **)&bm->map, 512, map_size);
+               if (err) {
+                       bm->map = NULL;
+                       goto fail;
+               }
+
+               err = posix_memalign((void **)&bm->shadow, 512, map_size);
+               if (err) {
+                       bm->shadow = NULL;
+                       goto fail;
+               }
+
+               memset(bm->map, 0, map_size);
+               memset(bm->shadow, 0, map_size);
+               s->bitmap_free[i] = bm;
+       }
+
+       return 0;
+
+fail:
+       vhd_free_bitmap_cache(s);
+       /* posix_memalign() returns a positive errno value; the rest of
+        * this file reports errors as negative values */
+       return -err;
+}
+
+/*
+ * Set up the state needed for a dynamic image: read and version-check
+ * the header, derive the sector geometry (spp, spb, bm_secs) and, unless
+ * the image was opened with VHD_FLAG_OPEN_NO_CACHE, load the BAT and
+ * allocate the bitmap cache.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported header version, or
+ * the error of the failing step.
+ */
+static int
+vhd_initialize_dynamic_disk(struct vhd_state *s)
+{
+       int rc;
+
+       rc = vhd_get_header(&s->vhd);
+       if (rc) {
+               if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+                       EPRINTF("Error reading VHD DD header.\n");
+               return rc;
+       }
+
+       if (s->vhd.header.hdr_ver != 0x00010000) {
+               EPRINTF("unsupported header version! (0x%x)\n",
+                       s->vhd.header.hdr_ver);
+               return -EINVAL;
+       }
+
+       s->spp     = getpagesize() >> VHD_SECTOR_SHIFT;
+       s->spb     = s->vhd.header.block_size >> VHD_SECTOR_SHIFT;
+       s->bm_secs = secs_round_up_no_zero(s->spb >> 3);
+
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_NO_CACHE))
+               return 0;
+
+       rc = vhd_initialize_bat(s);
+       if (rc)
+               return rc;
+
+       rc = vhd_initialize_bitmap_cache(s);
+       if (rc)
+               vhd_free_bat(s);   /* undo the BAT setup on failure */
+
+       return rc;
+}
+
+/*
+ * Reject images whose creator tag starts with "tap" and whose creator
+ * version is newer than VHD_CURRENT_VERSION.  Images with any other
+ * creator tag are accepted unchecked.
+ *
+ * Returns 0 if the image is acceptable, -EINVAL otherwise (with a
+ * warning unless the image was opened quietly).
+ */
+static int
+vhd_check_version(struct vhd_state *s)
+{
+       if (strncmp(s->vhd.footer.crtr_app, "tap", 3) != 0)
+               return 0;
+
+       if (s->vhd.footer.crtr_ver <= VHD_CURRENT_VERSION)
+               return 0;
+
+       if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+               EPRINTF("WARNING: %s vhd creator version 0x%08x, "
+                       "but only versions up to 0x%08x are "
+                       "supported for IO\n", s->vhd.file,
+                       s->vhd.footer.crtr_ver, VHD_CURRENT_VERSION);
+
+       return -EINVAL;
+}
+
+/*
+ * Log a summary of a freshly opened image: creator tag and version and,
+ * for dynamic images, the number of BAT entries, how many blocks are
+ * allocated, how many are marked full in the batmap, and next_db.
+ * Nothing is logged when the image was opened quietly.
+ */
+static void
+vhd_log_open(struct vhd_state *s)
+{
+       char buf[5];
+       uint32_t i, allocated, full;
+
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+               return;
+
+       /* Bound the read with a precision: the creator tag is presumably
+        * a fixed-width field that need not be NUL-terminated, so a bare
+        * "%s" could read past it. */
+       snprintf(buf, sizeof(buf), "%.*s",
+                (int)sizeof(buf) - 1, s->vhd.footer.crtr_app);
+       if (!vhd_type_dynamic(&s->vhd)) {
+               DPRINTF("%s version: %s 0x%08x\n",
+                       s->vhd.file, buf, s->vhd.footer.crtr_ver);
+               return;
+       }
+
+       allocated = 0;
+       full      = 0;
+
+       for (i = 0; i < s->bat.bat.entries; i++) {
+               if (bat_entry(s, i) != DD_BLK_UNUSED)
+                       allocated++;
+               if (test_batmap(s, i))
+                       full++;
+       }
+
+       DPRINTF("%s version: %s 0x%08x, b: %u, a: %u, f: %u, n: %"PRIu64"\n",
+               s->vhd.file, buf, s->vhd.footer.crtr_ver, s->bat.bat.entries,
+               allocated, full, s->next_db);
+}
+
+/*
+ * Open the VHD image 'name' and initialise the driver's private state.
+ *
+ * Steps: reset the state, make sure the shared zero mapping exists, open
+ * the image through libvhd (retrying once with log level 1 if the first
+ * attempt fails), check the creator version, set up BAT/bitmap state for
+ * dynamic images, fill the request free list and publish the disk
+ * geometry in driver->info.  When opened strict and writable, the
+ * on-disk footer is overwritten and counted as a write, so _vhd_close()
+ * writes a fresh footer.
+ *
+ * Returns 0 on success or the error of the failing step; on failure
+ * everything acquired here is released.
+ */
+static int
+__vhd_open(td_driver_t *driver, const char *name, vhd_flag_t flags)
+{
+       int i, o_flags, err;
+       struct vhd_state *s;
+
+       DBG(TLOG_INFO, "vhd_open: %s\n", name);
+       if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT))
+               libvhd_set_log_level(1);
+
+       s = (struct vhd_state *)driver->data;
+       memset(s, 0, sizeof(struct vhd_state));
+
+       s->flags  = flags;
+       s->driver = driver;
+
+       err = vhd_initialize(s);
+       if (err)
+               return err;
+
+       o_flags = ((test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) ?
+                  VHD_OPEN_RDONLY : VHD_OPEN_RDWR);
+
+       err = vhd_open(&s->vhd, name, o_flags);
+       if (err) {
+               libvhd_set_log_level(1);
+               err = vhd_open(&s->vhd, name, o_flags);
+               if (err) {
+                       EPRINTF("Unable to open [%s] (%d)!\n", name, err);
+                       /* drop the zero mapping (and _vhd_master) if this
+                        * open created it; nothing else is held yet */
+                       vhd_free(s);
+                       return err;
+               }
+       }
+
+       err = vhd_check_version(s);
+       if (err)
+               goto fail;
+
+       s->spb = s->spp = 1;
+
+       if (vhd_type_dynamic(&s->vhd)) {
+               err = vhd_initialize_dynamic_disk(s);
+               if (err)
+                       goto fail;
+       }
+
+       vhd_log_open(s);
+
+       SPB = s->spb;
+
+       s->vreq_free_count = VHD_REQS_DATA;
+       for (i = 0; i < VHD_REQS_DATA; i++)
+               s->vreq_free[i] = s->vreq_list + i;
+
+       driver->info.size        = s->vhd.footer.curr_size >> VHD_SECTOR_SHIFT;
+       driver->info.sector_size = VHD_SECTOR_SIZE;
+       driver->info.info        = 0;
+
+       DBG(TLOG_INFO, "vhd_open: done (sz:%"PRIu64", sct:%lu, inf:%u)\n",
+           driver->info.size, driver->info.sector_size, driver->info.info);
+
+       if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT) &&
+           !test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) {
+               err = vhd_kill_footer(s);
+               if (err) {
+                       DPRINTF("ERROR killing footer: %d\n", err);
+                       goto fail;
+               }
+               s->writes++;
+       }
+
+       return 0;
+
+ fail:
+       vhd_free_bat(s);
+       vhd_free_bitmap_cache(s);
+       vhd_close(&s->vhd);
+       vhd_free(s);
+       return err;
+}
+
+/*
+ * Driver open entry point: translate the generic TD_OPEN_* flags into
+ * VHD_FLAG_OPEN_* flags and hand over to __vhd_open().  A query open
+ * implies quiet, read-only and no-cache.  Preallocation is requested for
+ * every storage type except NFS and LVM.
+ */
+static int
+_vhd_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       vhd_flag_t translated = 0;
+
+       if (flags & TD_OPEN_QUERY)
+               translated |= (VHD_FLAG_OPEN_QUERY  |
+                              VHD_FLAG_OPEN_QUIET  |
+                              VHD_FLAG_OPEN_RDONLY |
+                              VHD_FLAG_OPEN_NO_CACHE);
+       if (flags & TD_OPEN_STRICT)
+               translated |= VHD_FLAG_OPEN_STRICT;
+       if (flags & TD_OPEN_QUIET)
+               translated |= VHD_FLAG_OPEN_QUIET;
+       if (flags & TD_OPEN_RDONLY)
+               translated |= VHD_FLAG_OPEN_RDONLY;
+
+       /* pre-allocate for all but NFS and LVM storage */
+       if (!(driver->storage == TAPDISK_STORAGE_TYPE_NFS ||
+             driver->storage == TAPDISK_STORAGE_TYPE_LVM))
+               translated |= VHD_FLAG_OPEN_PREALLOCATE;
+
+       return __vhd_open(driver, name, translated);
+}
+
+/*
+ * Log BAT statistics at close time: number of entries, allocated
+ * blocks, blocks marked full in the batmap, and next_db.  Silent when
+ * the image was opened quietly.
+ */
+static void
+vhd_log_close(struct vhd_state *s)
+{
+       uint32_t blk;
+       uint32_t n_allocated = 0;
+       uint32_t n_full      = 0;
+
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+               return;
+
+       for (blk = 0; blk < s->bat.bat.entries; blk++) {
+               if (bat_entry(s, blk) != DD_BLK_UNUSED)
+                       n_allocated++;
+               if (test_batmap(s, blk))
+                       n_full++;
+       }
+
+       DPRINTF("%s: b: %u, a: %u, f: %u, n: %"PRIu64"\n",
+               s->vhd.file, s->bat.bat.entries, n_allocated, n_full,
+               s->next_db);
+}
+
+/*
+ * Driver close entry point.  For writable images that were opened
+ * strict or have been written to, the footer (and the batmap, if the
+ * image has one) is written back; write errors are logged but do not
+ * stop the close.  All per-image resources are then released and the
+ * state is zeroed.  Always returns 0.
+ */
+static int
+_vhd_close(td_driver_t *driver)
+{
+       int err;
+       struct vhd_state *s;
+
+       DBG(TLOG_WARN, "vhd_close\n");
+       s = (struct vhd_state *)driver->data;
+
+       /* don't write footer if tapdisk is read-only */
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY))
+               goto out;
+
+       /*
+        * write footer if:
+        *   - we killed it on open (opened with strict)
+        *   - we've written data since opening
+        */
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_STRICT) || s->writes) {
+               /* lend our BAT to the libvhd context for the duration of
+                * the footer write, then detach it again */
+               memcpy(&s->vhd.bat, &s->bat.bat, sizeof(vhd_bat_t));
+               err = vhd_write_footer(&s->vhd, &s->vhd.footer);
+               memset(&s->vhd.bat, 0, sizeof(vhd_bat_t));
+
+               if (err)
+                       EPRINTF("writing %s footer: %d\n", s->vhd.file, err);
+
+               if (!vhd_has_batmap(&s->vhd))
+                       goto out;
+
+               err = vhd_write_batmap(&s->vhd, &s->bat.batmap);
+               if (err)
+                       EPRINTF("writing %s batmap: %d\n", s->vhd.file, err);
+       }
+
+ out:
+       vhd_log_close(s);
+       vhd_free_bat(s);
+       vhd_free_bitmap_cache(s);
+       vhd_close(&s->vhd);
+       vhd_free(s);
+
+       memset(s, 0, sizeof(struct vhd_state));
+
+       return 0;
+}
+
+/*
+ * Check that parent_driver is an acceptable parent for child_driver.
+ *
+ * A VHD parent is accepted when its footer uuid matches the parent uuid
+ * recorded in the child's header.  A non-VHD parent is accepted only
+ * for a VHD child of differencing type whose parent is flagged raw.
+ *
+ * Returns 0 when the parent is valid, -EINVAL otherwise.
+ */
+int
+vhd_validate_parent(td_driver_t *child_driver,
+                   td_driver_t *parent_driver, td_flag_t flags)
+{
+       struct stat stats;
+       struct vhd_state *child  = (struct vhd_state *)child_driver->data;
+       struct vhd_state *parent;
+
+       if (parent_driver->type != DISK_TYPE_VHD) {
+               /* raw parent: the child must be a differencing VHD that
+                * declares a raw parent */
+               if (child_driver->type != DISK_TYPE_VHD ||
+                   child->vhd.footer.type != HD_TYPE_DIFF ||
+                   !vhd_parent_raw(&child->vhd))
+                       return -EINVAL;
+
+               return 0;
+       }
+
+       parent = (struct vhd_state *)parent_driver->data;
+
+       /* 
+        * This check removed because of cases like:
+        *   - parent VHD marked as 'hidden'
+        *   - parent VHD modified during coalesce
+        */
+       /*
+       if (stat(parent->vhd.file, &stats)) {
+               DPRINTF("ERROR stating parent file %s\n", parent->vhd.file);
+               return -errno;
+       }
+
+       if (child->hdr.prt_ts != vhd_time(stats.st_mtime)) {
+               DPRINTF("ERROR: parent file has been modified since "
+                       "snapshot.  Child image no longer valid.\n");
+               return -EINVAL;
+       }
+       */
+
+       if (uuid_compare(child->vhd.header.prt_uuid,
+                        parent->vhd.footer.uuid) != 0) {
+               DPRINTF("ERROR: %s: %s, %s: parent uuid has changed since "
+                       "snapshot.  Child image no longer valid.\n",
+                       __func__, child->vhd.file, parent->vhd.file);
+               return -EINVAL;
+       }
+
+       /* TODO: compare sizes */
+
+       return 0;
+}
+
+/*
+ * Fill 'id' with the parent of this image.
+ *
+ * Returns TD_NO_PARENT (with 'id' zeroed) for non-differencing images,
+ * the error from vhd_parent_locator_get() if the parent name cannot be
+ * obtained, or 0 with id->name set to that name and id->drivertype set
+ * to DISK_TYPE_AIO for a raw parent, DISK_TYPE_VHD otherwise.
+ */
+int
+vhd_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       struct vhd_state *s = (struct vhd_state *)driver->data;
+       char *parent_name;
+       int rc;
+
+       DBG(TLOG_DBG, "\n");
+       memset(id, 0, sizeof(td_disk_id_t));
+
+       if (s->vhd.footer.type != HD_TYPE_DIFF)
+               return TD_NO_PARENT;
+
+       rc = vhd_parent_locator_get(&s->vhd, &parent_name);
+       if (rc)
+               return rc;
+
+       id->name = parent_name;
+       if (vhd_parent_raw(&s->vhd)) {
+               DPRINTF("VHD: parent is raw\n");
+               id->drivertype = DISK_TYPE_AIO;
+       } else {
+               id->drivertype = DISK_TYPE_VHD;
+       }
+
+       return 0;
+}
+
+/* Reset a request list to the empty state (both ends NULL). */
+static inline void
+clear_req_list(struct vhd_req_list *list)
+{
+       list->tail = NULL;
+       list->head = NULL;
+}
+
+/*
+ * Append request 'e' to 'list'.  e->next is not touched here, so the
+ * caller is responsible for it being NULL.
+ */
+static inline void
+add_to_tail(struct vhd_req_list *list, struct vhd_request *e)
+{
+       if (list->head) {
+               list->tail->next = e;
+               list->tail       = e;
+       } else {
+               list->head = e;
+               list->tail = e;
+       }
+}
+
+/*
+ * Unlink request 'e' from 'list'.
+ *
+ * Returns 0 if 'e' was found and removed, -EINVAL if it is not on the
+ * list (including when the list is empty).  e->next is left unchanged.
+ */
+static inline int
+remove_from_req_list(struct vhd_req_list *list, struct vhd_request *e)
+{
+       struct vhd_request *i = list->head;
+
+       /* empty list: nothing to search, and i must not be dereferenced */
+       if (!i)
+               return -EINVAL;
+
+       if (list->head == e) {
+               if (list->tail == e)
+                       clear_req_list(list);
+               else
+                       list->head = list->head->next;
+               return 0;
+       }
+
+       while (i->next) {
+               if (i->next == e) {
+                       if (list->tail == e) {
+                               i->next = NULL;
+                               list->tail = i;
+                       } else
+                               i->next = i->next->next;
+                       return 0;
+               }
+               i = i->next;
+       }
+
+       return -EINVAL;
+}
+
+/* Zero a request and bind it to its owning image state. */
+static inline void
+init_vhd_request(struct vhd_state *s, struct vhd_request *req)
+{
+       memset(req, 0, sizeof(*req));
+       req->state = s;
+}
+
+/* Reset a transaction to its all-zero initial state. */
+static inline void
+init_tx(struct vhd_transaction *tx)
+{
+       memset(tx, 0, sizeof(*tx));
+}
+
+/*
+ * Attach request 'r' to transaction 'tx': link the request back to the
+ * transaction, count it as started, queue it on the transaction's
+ * request list and mark the transaction live.  The transaction must not
+ * be closed.
+ */
+static inline void
+add_to_transaction(struct vhd_transaction *tx, struct vhd_request *r)
+{
+       ASSERT(!tx->closed);
+
+       set_vhd_flag(tx->status, VHD_FLAG_TX_LIVE);
+       tx->started++;
+       r->tx = tx;
+       add_to_tail(&tx->requests, r);
+
+       DBG(TLOG_DBG, "blk: 0x%04"PRIx64", lsec: 0x%08"PRIx64", tx: %p, "
+           "started: %d, finished: %d, status: %u\n",
+           r->treq.sec / SPB, r->treq.sec, tx,
+           tx->started, tx->finished, tx->status);
+}
+
+/* Non-zero once every request started in 'tx' has also finished. */
+static inline int
+transaction_completed(struct vhd_transaction *tx)
+{
+       return (tx->finished == tx->started);
+}
+
+/*
+ * Reset the pending-BAT-write bookkeeping: status flags, pending block
+ * and offset, and the transaction/list linkage and error of the BAT
+ * request.  The BAT contents themselves are not touched.
+ */
+static inline void
+init_bat(struct vhd_state *s)
+{
+       struct vhd_bat_state *bat = &s->bat;
+
+       bat->status     = 0;
+       bat->pbw_blk    = 0;
+       bat->pbw_offset = 0;
+       bat->req.error  = 0;
+       bat->req.next   = NULL;
+       bat->req.tx     = NULL;
+}
+
+/* Set the BAT lock bit. */
+static inline void
+lock_bat(struct vhd_state *s)
+{
+       s->bat.status |= VHD_FLAG_BAT_LOCKED;
+}
+
+/* Clear the BAT lock bit. */
+static inline void
+unlock_bat(struct vhd_state *s)
+{
+       s->bat.status &= ~VHD_FLAG_BAT_LOCKED;
+}
+
+/* Non-zero while the BAT lock bit is set. */
+static inline int
+bat_locked(struct vhd_state *s)
+{
+       return s->bat.status & VHD_FLAG_BAT_LOCKED;
+}
+
+/*
+ * Return a cache entry to its pristine state: block number, sequence
+ * number and status cleared, transaction and both request lists reset,
+ * both bitmap buffers zeroed and the embedded request re-initialised.
+ * The buffers themselves are kept.
+ */
+static inline void
+init_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       bm->status = 0;
+       bm->seqno  = 0;
+       bm->blk    = 0;
+
+       init_tx(&bm->tx);
+       clear_req_list(&bm->queue);
+       clear_req_list(&bm->waiting);
+
+       memset(bm->map, 0, vhd_sectors_to_bytes(s->bm_secs));
+       memset(bm->shadow, 0, vhd_sectors_to_bytes(s->bm_secs));
+
+       init_vhd_request(s, &bm->req);
+}
+
+/*
+ * Look up the cached bitmap for 'block'.  Returns the first matching
+ * cache entry, or NULL if the block's bitmap is not cached.
+ */
+static inline struct vhd_bitmap *
+get_bitmap(struct vhd_state *s, uint32_t block)
+{
+       int slot = 0;
+
+       while (slot < VHD_CACHE_SIZE) {
+               struct vhd_bitmap *cand = s->bitmap[slot];
+
+               if (cand && cand->blk == block)
+                       return cand;
+               slot++;
+       }
+
+       return NULL;
+}
+
+/* Set the lock bit of a cached bitmap. */
+static inline void
+lock_bitmap(struct vhd_bitmap *bm)
+{
+       bm->status |= VHD_FLAG_BM_LOCKED;
+}
+
+/* Clear the lock bit of a cached bitmap. */
+static inline void
+unlock_bitmap(struct vhd_bitmap *bm)
+{
+       bm->status &= ~VHD_FLAG_BM_LOCKED;
+}
+
+/* Non-zero while the bitmap's lock bit is set. */
+static inline int
+bitmap_locked(struct vhd_bitmap *bm)
+{
+       return bm->status & VHD_FLAG_BM_LOCKED;
+}
+
+/* A bitmap is valid (1) unless its read-pending bit is set (0). */
+static inline int
+bitmap_valid(struct vhd_bitmap *bm)
+{
+       return (bm->status & VHD_FLAG_BM_READ_PENDING) == 0;
+}
+
+/*
+ * Returns 1 if the bitmap has any outstanding activity: a pending read
+ * or write, a transaction that must update the BAT, or a non-empty
+ * waiting, transaction or queue list.  Returns 0 otherwise.
+ */
+static inline int
+bitmap_in_use(struct vhd_bitmap *bm)
+{
+       if (bm->status & (VHD_FLAG_BM_READ_PENDING |
+                         VHD_FLAG_BM_WRITE_PENDING))
+               return 1;
+
+       if (bm->tx.status & VHD_FLAG_TX_UPDATE_BAT)
+               return 1;
+
+       if (bm->waiting.head || bm->tx.requests.head || bm->queue.head)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * Returns 1 (and logs it) when every one of the first spb/8 bytes of
+ * the bitmap's 'map' is 0xFF, 0 as soon as any other byte is found.
+ */
+static inline int
+bitmap_full(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       int bytes = s->spb >> 3;
+       int pos   = 0;
+
+       while (pos < bytes) {
+               if (bm->map[pos] != (char)0xFF)
+                       return 0;
+               pos++;
+       }
+
+       DBG(TLOG_DBG, "bitmap 0x%04x full\n", bm->blk);
+       return 1;
+}
+
+/*
+ * Evict the least recently used unlocked bitmap from the cache.
+ *
+ * Only entries whose seqno is strictly below s->bm_lru are candidates;
+ * among those, the unlocked one with the smallest seqno is chosen, its
+ * cache slot is cleared, and it is returned to the caller.
+ * Returns NULL if no candidate exists.
+ */
+static struct vhd_bitmap *
+remove_lru_bitmap(struct vhd_state *s)
+{
+       int slot, victim_slot = 0;
+       u64 oldest = s->bm_lru;
+       struct vhd_bitmap *victim = NULL;
+
+       for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+               struct vhd_bitmap *cand = s->bitmap[slot];
+
+               if (!cand)
+                       continue;
+               if (cand->seqno >= oldest)
+                       continue;
+               if (bitmap_locked(cand))
+                       continue;
+
+               victim_slot = slot;
+               victim      = cand;
+               oldest      = cand->seqno;
+       }
+
+       if (!victim)
+               return NULL;
+
+       s->bitmap[victim_slot] = NULL;
+       ASSERT(!bitmap_in_use(victim));
+
+       return victim;
+}
+
+/*
+ * Obtain an initialised bitmap descriptor for block @blk.
+ *
+ * Takes a descriptor from the free list when one is available, otherwise
+ * evicts one from the cache via remove_lru_bitmap().  The descriptor is
+ * reset with init_vhd_bitmap() and tagged with @blk; it is NOT installed
+ * in the cache here.
+ *
+ * On success stores the descriptor in *@bitmap and returns 0.
+ * Returns -EBUSY (with *@bitmap set to NULL) if nothing could be evicted.
+ */
+static int
+alloc_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap **bitmap, uint32_t blk)
+{
+       struct vhd_bitmap *fresh;
+
+       *bitmap = NULL;
+
+       if (!(s->bm_free_count > 0)) {
+               fresh = remove_lru_bitmap(s);
+               if (!fresh)
+                       return -EBUSY;
+       } else {
+               fresh = s->bitmap_free[--s->bm_free_count];
+       }
+
+       init_vhd_bitmap(s, fresh);
+       fresh->blk = blk;
+
+       *bitmap = fresh;
+       return 0;
+}
+
+/*
+ * Produce the next LRU sequence number.
+ *
+ * When s->bm_lru has reached 0xffffffff, the sequence numbers of all
+ * cached bitmaps are halved and s->bm_lru is reset to the largest of the
+ * halved values before being incremented.  In every case the function
+ * returns the pre-incremented s->bm_lru.
+ */
+static inline uint64_t
+__bitmap_lru_seqno(struct vhd_state *s)
+{
+       int slot;
+
+       if (s->bm_lru != 0xffffffff)
+               return ++s->bm_lru;
+
+       s->bm_lru = 0;
+       for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+               struct vhd_bitmap *cached = s->bitmap[slot];
+
+               if (!cached)
+                       continue;
+
+               cached->seqno >>= 1;
+               if (s->bm_lru < cached->seqno)
+                       s->bm_lru = cached->seqno;
+       }
+
+       return ++s->bm_lru;
+}
+
+/* Stamp @bm with a fresh LRU sequence number from __bitmap_lru_seqno(). */
+static inline void
+touch_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       bm->seqno = __bitmap_lru_seqno(s);
+}
+
+/*
+ * Place @bm in the first empty slot of the bitmap cache and give it a
+ * fresh LRU sequence number.  Asserts if the cache has no empty slot.
+ */
+static inline void
+install_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       int slot;
+
+       for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+               if (s->bitmap[slot])
+                       continue;
+
+               touch_bitmap(s, bm);
+               s->bitmap[slot] = bm;
+               return;
+       }
+
+       ASSERT(0);
+}
+
+/*
+ * Remove @bm from the bitmap cache and push it onto the free list.
+ *
+ * Asserts that the bitmap is unlocked, not in use, and actually present
+ * in the cache.
+ */
+static inline void
+free_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       int slot = 0;
+
+       /* locate the cache slot currently holding bm */
+       while (slot < VHD_CACHE_SIZE && s->bitmap[slot] != bm)
+               slot++;
+
+       ASSERT(!bitmap_locked(bm));
+       ASSERT(!bitmap_in_use(bm));
+       ASSERT(slot < VHD_CACHE_SIZE);
+
+       s->bitmap[slot] = NULL;
+       s->bitmap_free[s->bm_free_count++] = bm;
+}
+
+/*
+ * Classify @sector using the BAT, the batmap and the bitmap cache.
+ *
+ * @s:      vhd driver state
+ * @sector: logical sector being accessed
+ * @op:     operation being performed; VHD_OP_DATA_WRITE gets special
+ *          treatment when the block is unallocated and the BAT is locked
+ *
+ * Returns one of:
+ *   VHD_BM_BIT_SET      - fixed disk, batmap set for the block, or the
+ *                         sector's bit is set in the cached bitmap
+ *   VHD_BM_BIT_CLEAR    - the sector's bit is clear in the cached bitmap
+ *   VHD_BM_BAT_CLEAR    - the block's BAT entry is DD_BLK_UNUSED
+ *   VHD_BM_BAT_LOCKED   - write to an unallocated block while the BAT is
+ *                         locked on behalf of a different block
+ *   VHD_BM_NOT_CACHED   - the block's bitmap is not in the cache
+ *   VHD_BM_READ_PENDING - the block's bitmap is cached but still being read
+ *   -EINVAL             - @sector maps to a block beyond the BAT
+ */
+static int
+read_bitmap_cache(struct vhd_state *s, uint64_t sector, uint8_t op)
+{
+       u32 blk, sec;
+       struct vhd_bitmap *bm;
+
+       /* in fixed disks, every block is present */
+       if (s->vhd.footer.type == HD_TYPE_FIXED)
+               return VHD_BM_BIT_SET;
+
+       blk = sector / s->spb;
+       sec = sector % s->spb;
+
+       /* valid BAT indices are 0 .. max_bat_size - 1, so reject
+        * blk == max_bat_size too before bat_entry() is indexed */
+       if (blk >= s->vhd.header.max_bat_size) {
+               DPRINTF("ERROR: sec %"PRIu64" out of range, op = %d\n",
+                       sector, op);
+               return -EINVAL;
+       }
+
+       if (bat_entry(s, blk) == DD_BLK_UNUSED) {
+               /* a write must wait if the BAT is busy with another block */
+               if (op == VHD_OP_DATA_WRITE &&
+                   s->bat.pbw_blk != blk && bat_locked(s))
+                       return VHD_BM_BAT_LOCKED;
+
+               return VHD_BM_BAT_CLEAR;
+       }
+
+       if (test_batmap(s, blk)) {
+               DBG(TLOG_DBG, "batmap set for 0x%04x\n", blk);
+               return VHD_BM_BIT_SET;
+       }
+
+       bm = get_bitmap(s, blk);
+       if (!bm)
+               return VHD_BM_NOT_CACHED;
+
+       /* bump lru count */
+       touch_bitmap(s, bm);
+
+       if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING))
+               return VHD_BM_READ_PENDING;
+
+       return ((vhd_bitmap_test(&s->vhd, bm->map, sec)) ?
+               VHD_BM_BIT_SET : VHD_BM_BIT_CLEAR);
+}
+
+/*
+ * Count how many consecutive sectors, starting at @sector, have their
+ * bitmap bit equal to @value.
+ *
+ * The count never exceeds @nr_secs and never crosses the end of the
+ * block containing @sector.  Fixed disks always report @nr_secs; blocks
+ * with the batmap bit set report the remainder of the block (capped at
+ * @nr_secs).  Otherwise the block's bitmap must already be cached and
+ * valid (asserted).
+ */
+static int
+read_bitmap_cache_span(struct vhd_state *s,
+                      uint64_t sector, int nr_secs, int value)
+{
+       int run;
+       u32 blk, sec;
+       struct vhd_bitmap *bm;
+
+       /* in fixed disks, every block is present */
+       if (s->vhd.footer.type == HD_TYPE_FIXED)
+               return nr_secs;
+
+       blk = sector / s->spb;
+       sec = sector % s->spb;
+
+       if (test_batmap(s, blk))
+               return MIN(nr_secs, s->spb - sec);
+
+       bm = get_bitmap(s, blk);
+       ASSERT(bm && bitmap_valid(bm));
+
+       run = 0;
+       while (sec < s->spb && run < nr_secs) {
+               if (vhd_bitmap_test(&s->vhd, bm->map, sec) != value)
+                       break;
+               sec++;
+               run++;
+       }
+
+       return run;
+}
+
+/*
+ * Pop a request from the free list and initialise it.
+ * Returns NULL when the free list is empty.  Asserts that the popped
+ * request has treq.secs == 0.
+ */
+static inline struct vhd_request *
+alloc_vhd_request(struct vhd_state *s)
+{
+       struct vhd_request *fresh;
+
+       if (!(s->vreq_free_count > 0))
+               return NULL;
+
+       fresh = s->vreq_free[--s->vreq_free_count];
+       ASSERT(fresh->treq.secs == 0);
+       init_vhd_request(s, fresh);
+
+       return fresh;
+}
+
+/* Zero @req and push it back onto the request free list. */
+static inline void
+free_vhd_request(struct vhd_state *s, struct vhd_request *req)
+{
+       memset(req, 0, sizeof(*req));
+
+       s->vreq_free[s->vreq_free_count] = req;
+       s->vreq_free_count++;
+}
+
+/*
+ * Submit an asynchronous read for @req.
+ *
+ * Prepares req->tiocb to read req->treq.secs sectors from the image file
+ * descriptor at @offset into req->treq.buf, with vhd_complete as the
+ * completion callback and @req as its argument, then queues it on
+ * s->driver.  Callers in this file pass @offset in bytes.
+ *
+ * Also updates the queued/reads counters; read_size is accumulated in
+ * sectors.
+ */
+static inline void
+aio_read(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
+{
+       struct tiocb *tiocb = &req->tiocb;
+
+       td_prep_read(tiocb, s->vhd.fd, req->treq.buf,
+                    vhd_sectors_to_bytes(req->treq.secs),
+                    offset, vhd_complete, req);
+       td_queue_tiocb(s->driver, tiocb);
+
+       s->queued++;
+       s->reads++;
+       s->read_size += req->treq.secs;
+       TRACE(s);
+}
+
+/*
+ * Submit an asynchronous write for @req.
+ *
+ * Prepares req->tiocb to write req->treq.secs sectors from req->treq.buf
+ * to the image file descriptor at @offset, with vhd_complete as the
+ * completion callback and @req as its argument, then queues it on
+ * s->driver.  Callers in this file pass @offset in bytes.
+ *
+ * Also updates the queued/writes counters; write_size is accumulated in
+ * sectors.
+ */
+static inline void
+aio_write(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
+{
+       struct tiocb *tiocb = &req->tiocb;
+
+       td_prep_write(tiocb, s->vhd.fd, req->treq.buf,
+                     vhd_sectors_to_bytes(req->treq.secs),
+                     offset, vhd_complete, req);
+       td_queue_tiocb(s->driver, tiocb);
+
+       s->queued++;
+       s->writes++;
+       s->write_size += req->treq.secs;
+       TRACE(s);
+}
+
+/*
+ * Record where the next new block for @blk will live.
+ *
+ * Sets s->bat.pbw_blk to @blk and s->bat.pbw_offset to s->next_db plus
+ * whatever padding is needed so that the data region following the
+ * bitmap begins on a page boundary.  s->next_db itself is not modified.
+ *
+ * Must not be called while a BAT write has been started (asserted).
+ * Returns the current (unpadded) s->next_db.
+ */
+static inline uint64_t
+reserve_new_block(struct vhd_state *s, uint32_t blk)
+{
+       int pad;
+
+       ASSERT(!test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));
+
+       /* data region of segment should begin on page boundary */
+       pad = 0;
+       if (((s->next_db + s->bm_secs) % s->spp) != 0)
+               pad = (s->spp - ((s->next_db + s->bm_secs) % s->spp));
+
+       s->bat.pbw_offset = s->next_db + pad;
+       s->bat.pbw_blk    = blk;
+
+       return s->next_db;
+}
+
+/*
+ * Write the BAT sector containing the pending block's entry to disk.
+ *
+ * Requires the BAT to be locked (asserted).  The 128-entry (512-byte)
+ * group of BAT entries containing s->bat.pbw_blk is copied into
+ * s->bat.bat_buf, the entry for pbw_blk is replaced by s->bat.pbw_offset,
+ * each entry is passed through BE32_OUT (presumably conversion to the
+ * on-disk big-endian format -- confirm against its definition), and the
+ * buffer is submitted as a one-sector VHD_OP_BAT_WRITE via s->bat.req.
+ * VHD_FLAG_BAT_WRITE_STARTED is set after submission.
+ *
+ * Always returns 0.
+ */
+static int
+schedule_bat_write(struct vhd_state *s)
+{
+       int i;
+       u32 blk;
+       char *buf;
+       u64 offset;
+       struct vhd_request *req;
+
+       ASSERT(bat_locked(s));
+
+       req = &s->bat.req;
+       buf = s->bat.bat_buf;
+       blk = s->bat.pbw_blk;
+
+       init_vhd_request(s, req);
+       /* copy the 128-entry group that contains blk */
+       memcpy(buf, &bat_entry(s, blk - (blk % 128)), 512);
+
+       /* patch in the new entry; the in-memory BAT is left untouched here */
+       ((u32 *)buf)[blk % 128] = s->bat.pbw_offset;
+
+       for (i = 0; i < 128; i++)
+               BE32_OUT(&((u32 *)buf)[i]);
+
+       /* byte offset of that group within the on-disk table (4 bytes/entry) */
+       offset         = s->vhd.header.table_offset + (blk - (blk % 128)) * 4;
+       req->treq.secs = 1;
+       req->treq.buf  = buf;
+       req->op        = VHD_OP_BAT_WRITE;
+       req->next      = NULL;
+
+       aio_write(s, req, offset);
+       set_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED);
+
+       DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64", "
+           "table_offset: 0x%08"PRIx64"\n", blk, s->bat.pbw_offset, offset);
+
+       return 0;
+}
+
+/*
+ * Write zeros over the padding and bitmap area of the pending new block.
+ *
+ * @bm:     cached bitmap of the block being allocated
+ * @lb_end: sector at which the zero write starts (update_bat() passes the
+ *          value returned by reserve_new_block())
+ *
+ * Uses s->bat.zero_req as a VHD_OP_ZERO_BM_WRITE covering the sectors
+ * between @lb_end and s->bat.pbw_offset plus s->bm_secs bitmap sectors,
+ * sourced from vhd_zeros().  The bitmap is locked and the request is
+ * added to the bitmap's transaction before being submitted.
+ */
+static void
+schedule_zero_bm_write(struct vhd_state *s,
+                      struct vhd_bitmap *bm, uint64_t lb_end)
+{
+       uint64_t offset;
+       struct vhd_request *req = &s->bat.zero_req;
+
+       init_vhd_request(s, req);
+
+       offset         = vhd_sectors_to_bytes(lb_end);
+       req->op        = VHD_OP_ZERO_BM_WRITE;
+       req->treq.sec  = s->bat.pbw_blk * s->spb;
+       /* alignment gap (if any) followed by the bitmap itself */
+       req->treq.secs = (s->bat.pbw_offset - lb_end) + s->bm_secs;
+       req->treq.buf  = vhd_zeros(vhd_sectors_to_bytes(req->treq.secs));
+       req->next      = NULL;
+
+       DBG(TLOG_DBG, "blk: 0x%04x, writing zero bitmap at 0x%08"PRIx64"\n",
+           s->bat.pbw_blk, offset);
+
+       lock_bitmap(bm);
+       add_to_transaction(&bm->tx, req);
+       aio_write(s, req, offset);
+}
+
+/*
+ * Start allocating a new block for @blk (used by schedule_data_write()
+ * when VHD_FLAG_OPEN_PREALLOCATE is not set).
+ *
+ * @blk must currently be unallocated (asserted).  If the BAT is already
+ * locked, it must be on behalf of @blk (asserted) and nothing more is
+ * done.  Otherwise an empty bitmap for @blk is put in the cache if not
+ * already there, the BAT is locked, the new block's location is
+ * reserved, a zero-bitmap write is scheduled and the bitmap's
+ * transaction is flagged VHD_FLAG_TX_UPDATE_BAT.
+ *
+ * Returns 0 on success or the error from alloc_vhd_bitmap().
+ */
+static int
+update_bat(struct vhd_state *s, uint32_t blk)
+{
+       int err;
+       uint64_t lb_end;
+       struct vhd_bitmap *bm;
+
+       ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
+       
+       if (bat_locked(s)) {
+               ASSERT(s->bat.pbw_blk == blk);
+               return 0;
+       }
+
+       /* empty bitmap could already be in
+        * cache if earlier bat update failed */
+       bm = get_bitmap(s, blk);
+       if (!bm) {
+               /* install empty bitmap in cache */
+               err = alloc_vhd_bitmap(s, &bm, blk);
+               if (err) 
+                       return err;
+
+               install_bitmap(s, bm);
+       }
+
+       lock_bat(s);
+       lb_end = reserve_new_block(s, blk);
+       schedule_zero_bm_write(s, bm, lb_end);
+       set_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT);
+
+       return 0;
+}
+
+/*
+ * Synchronously allocate and zero-fill a new block for @blk (used by
+ * schedule_data_write() when VHD_FLAG_OPEN_PREALLOCATE is set).
+ *
+ * @blk must currently be unallocated (asserted).  If the BAT is already
+ * locked it must be on behalf of @blk (asserted); in that case the
+ * function returns -EBUSY if the pending BAT request carries an error,
+ * and 0 otherwise.
+ *
+ * Otherwise the function:
+ *   - aligns s->next_db so the data region after the bitmap starts on a
+ *     page boundary and records the result in s->bat.pbw_offset,
+ *   - writes zeros over the alignment gap, bitmap and data sectors with
+ *     blocking lseek()/write(),
+ *   - makes sure an empty bitmap for @blk is in the cache,
+ *   - locks the BAT and the bitmap, schedules the BAT write and adds it
+ *     to the bitmap's transaction.
+ *
+ * Returns 0 on success, a negative errno value if lseek()/write() fail,
+ * -EIO on a short write, or the error from alloc_vhd_bitmap().
+ */
+static int
+allocate_block(struct vhd_state *s, uint32_t blk)
+{
+       int err, gap;
+       uint64_t offset, size;
+       struct vhd_bitmap *bm;
+
+       ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
+
+       if (bat_locked(s)) {
+               ASSERT(s->bat.pbw_blk == blk);
+               if (s->bat.req.error)
+                       return -EBUSY;
+               return 0;
+       }
+
+       gap            = 0;
+       s->bat.pbw_blk = blk;
+       /* zeroing starts at the unaligned end of the previous block */
+       offset         = vhd_sectors_to_bytes(s->next_db);
+
+       /* data region of segment should begin on page boundary */
+       if ((s->next_db + s->bm_secs) % s->spp) {
+               gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp));
+               s->next_db += gap;
+       }
+
+       s->bat.pbw_offset = s->next_db;
+
+       DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64"\n",
+           blk, s->bat.pbw_offset);
+
+       if (lseek(s->vhd.fd, offset, SEEK_SET) == (off_t)-1) {
+               /* latch errno before logging: the logging call may
+                * overwrite it, which could turn this into a 0 return */
+               err = -errno;
+               ERR(-err, "lseek failed\n");
+               return err;
+       }
+
+       size = vhd_sectors_to_bytes(s->spb + s->bm_secs + gap);
+       err  = write(s->vhd.fd, vhd_zeros(size), size);
+       if (err != size) {
+               /* -1 is a real failure; anything else is a short write */
+               err = (err == -1 ? -errno : -EIO);
+               ERR(err, "write failed");
+               return err;
+       }
+
+       /* empty bitmap could already be in
+        * cache if earlier bat update failed */
+       bm = get_bitmap(s, blk);
+       if (!bm) {
+               /* install empty bitmap in cache */
+               err = alloc_vhd_bitmap(s, &bm, blk);
+               if (err)
+                       return err;
+
+               install_bitmap(s, bm);
+       }
+
+       lock_bat(s);
+       lock_bitmap(bm);
+       schedule_bat_write(s);
+       add_to_transaction(&bm->tx, &s->bat.req);
+
+       return 0;
+}
+
+/*
+ * Submit an asynchronous data read for @treq.
+ *
+ * For fixed disks the file offset is simply treq.sec converted to bytes.
+ * Otherwise the block must be allocated and either covered by the batmap
+ * or have a valid cached bitmap (both asserted); the offset is the
+ * block's BAT entry plus the bitmap size plus the sector's position
+ * within the block, converted to bytes.
+ *
+ * Returns 0 once the read is queued, or -EBUSY if no request structure
+ * is available.
+ */
+static int 
+schedule_data_read(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
+{
+       u64 offset;
+       u32 blk = 0, sec = 0;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *req;
+
+       if (s->vhd.footer.type == HD_TYPE_FIXED) {
+               offset = vhd_sectors_to_bytes(treq.sec);
+               goto make_request;
+       }
+
+       blk    = treq.sec / s->spb;
+       sec    = treq.sec % s->spb;
+       bm     = get_bitmap(s, blk);
+       offset = bat_entry(s, blk);
+
+       ASSERT(offset != DD_BLK_UNUSED);
+       ASSERT(test_batmap(s, blk) || (bm && bitmap_valid(bm)));
+
+       /* skip the block's bitmap, then index into its data sectors */
+       offset += s->bm_secs + sec;
+       offset  = vhd_sectors_to_bytes(offset);
+
+ make_request:
+       req = alloc_vhd_request(s);
+       if (!req) 
+               return -EBUSY;
+
+       req->treq  = treq;
+       req->flags = flags;
+       req->op    = VHD_OP_DATA_READ;
+       req->next  = NULL;
+
+       aio_read(s, req, offset);
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
+           "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x, buf: %p\n",
+           s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags,
+           treq.buf);
+
+       return 0;
+}
+
+/*
+ * Submit an asynchronous data write for @treq.
+ *
+ * For fixed disks the file offset is treq.sec converted to bytes.
+ * Otherwise, when @flags contains VHD_FLAG_REQ_UPDATE_BAT, a new block is
+ * first set up via allocate_block() (preallocate mode) or update_bat(),
+ * and the pending block offset is used in place of the BAT entry.
+ *
+ * When @flags contains VHD_FLAG_REQ_UPDATE_BITMAP the block's bitmap must
+ * be cached and valid (asserted); it is locked and the request is either
+ * attached to the bitmap's open transaction or, if that transaction is
+ * closed, appended to the bitmap's queue and marked VHD_FLAG_REQ_QUEUED.
+ *
+ * Returns 0 once the write is queued, an error from the block allocation
+ * step, or -EBUSY if no request structure is available.
+ */
+static int
+schedule_data_write(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
+{
+       int err;
+       u64 offset;
+       u32 blk = 0, sec = 0;
+       struct vhd_bitmap  *bm = NULL;
+       struct vhd_request *req;
+
+       if (s->vhd.footer.type == HD_TYPE_FIXED) {
+               offset = vhd_sectors_to_bytes(treq.sec);
+               goto make_request;
+       }
+
+       blk    = treq.sec / s->spb;
+       sec    = treq.sec % s->spb;
+       offset = bat_entry(s, blk);
+
+       if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BAT)) {
+               if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
+                       err = allocate_block(s, blk);
+               else
+                       err = update_bat(s, blk);
+
+               if (err)
+                       return err;
+
+               /* BAT entry is not written yet; use the pending location */
+               offset = s->bat.pbw_offset;
+       }
+
+       /* skip the block's bitmap, then index into its data sectors */
+       offset += s->bm_secs + sec;
+       offset  = vhd_sectors_to_bytes(offset);
+
+ make_request:
+       req = alloc_vhd_request(s);
+       if (!req)
+               return -EBUSY;
+
+       req->treq  = treq;
+       req->flags = flags;
+       req->op    = VHD_OP_DATA_WRITE;
+       req->next  = NULL;
+
+       if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BITMAP)) {
+               bm = get_bitmap(s, blk);
+               ASSERT(bm && bitmap_valid(bm));
+               lock_bitmap(bm);
+
+               if (bm->tx.closed) {
+                       add_to_tail(&bm->queue, req);
+                       set_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED);
+               } else
+                       add_to_transaction(&bm->tx, req);
+       }
+
+       aio_write(s, req, offset);
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
+           "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x\n",
+           s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags);
+
+       return 0;
+}
+
+/*
+ * Start reading the on-disk bitmap of block @blk into the cache.
+ *
+ * Only valid for dynamic disks, for an allocated block whose bitmap is
+ * not yet cached (all asserted).  A bitmap descriptor is obtained with
+ * alloc_vhd_bitmap(), its embedded request is set up as a
+ * VHD_OP_BITMAP_READ of s->bm_secs sectors into bm->map from the block's
+ * BAT offset, and the read is submitted.  The bitmap is then locked,
+ * installed in the cache and flagged VHD_FLAG_BM_READ_PENDING.
+ *
+ * Returns 0 on success or the error from alloc_vhd_bitmap().
+ */
+static int 
+schedule_bitmap_read(struct vhd_state *s, uint32_t blk)
+{
+       int err;
+       u64 offset;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *req = NULL;
+
+       ASSERT(vhd_type_dynamic(&s->vhd));
+
+       offset = bat_entry(s, blk);
+
+       ASSERT(offset != DD_BLK_UNUSED);
+       ASSERT(!get_bitmap(s, blk));
+
+       offset = vhd_sectors_to_bytes(offset);
+
+       err = alloc_vhd_bitmap(s, &bm, blk);
+       if (err)
+               return err;
+
+       req = &bm->req;
+       init_vhd_request(s, req);
+
+       req->treq.sec  = blk * s->spb;
+       req->treq.secs = s->bm_secs;
+       req->treq.buf  = bm->map;
+       req->treq.cb   = NULL;
+       req->op        = VHD_OP_BITMAP_READ;
+       req->next      = NULL;
+
+       aio_read(s, req, offset);
+       lock_bitmap(bm);
+       install_bitmap(s, bm);
+       set_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, nr_secs: 0x%04x, "
+           "offset: 0x%08"PRIx64"\n", s->vhd.file, req->treq.sec, blk,
+           req->treq.secs, offset);
+
+       return 0;
+}
+
+/*
+ * Write the shadow copy of block @blk's bitmap to disk.
+ *
+ * Only valid for dynamic disks, with the bitmap cached, valid and with
+ * no bitmap write already pending (all asserted).  If the block has no
+ * BAT entry yet, the BAT must be locked on behalf of @blk (asserted) and
+ * the pending block offset is used instead.
+ *
+ * The bitmap's embedded request is set up as a VHD_OP_BITMAP_WRITE of
+ * s->bm_secs sectors from bm->shadow and submitted; the bitmap is then
+ * locked, its LRU stamp refreshed and VHD_FLAG_BM_WRITE_PENDING set.
+ */
+static void
+schedule_bitmap_write(struct vhd_state *s, uint32_t blk)
+{
+       u64 offset;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *req;
+
+       bm     = get_bitmap(s, blk);
+       offset = bat_entry(s, blk);
+
+       ASSERT(vhd_type_dynamic(&s->vhd));
+       ASSERT(bm && bitmap_valid(bm) &&
+              !test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));
+
+       if (offset == DD_BLK_UNUSED) {
+               ASSERT(bat_locked(s) && s->bat.pbw_blk == blk);
+               offset = s->bat.pbw_offset;
+       }
+       
+       offset = vhd_sectors_to_bytes(offset);
+
+       req = &bm->req;
+       init_vhd_request(s, req);
+
+       req->treq.sec  = blk * s->spb;
+       req->treq.secs = s->bm_secs;
+       req->treq.buf  = bm->shadow;
+       req->treq.cb   = NULL;
+       req->op        = VHD_OP_BITMAP_WRITE;
+       req->next      = NULL;
+
+       aio_write(s, req, offset);
+       lock_bitmap(bm);
+       touch_bitmap(s, bm);     /* bump lru count */
+       set_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);
+
+       DBG(TLOG_DBG, "%s: blk: 0x%04x, sec: 0x%08"PRIx64", nr_secs: 0x%04x, "
+           "offset: 0x%"PRIx64"\n", s->vhd.file, blk, req->treq.sec,
+           req->treq.secs, offset);
+}
+
+/* 
+ * queued requests will be submitted once the bitmap
+ * describing them is read and the requests are validated. 
+ *
+ * Park @treq (operation @op) on the waiting list of the bitmap covering
+ * its block.  Only valid for dynamic disks, and only while that bitmap
+ * is cached with a read pending (both asserted).  The bitmap is locked
+ * after the request is appended.
+ *
+ * Returns 0 on success or -EBUSY if no request structure is available.
+ */
+static int
+__vhd_queue_request(struct vhd_state *s, uint8_t op, td_request_t treq)
+{
+       u32 blk;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *req;
+
+       ASSERT(vhd_type_dynamic(&s->vhd));
+
+       blk = treq.sec / s->spb;
+       bm  = get_bitmap(s, blk);
+
+       ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));
+
+       req = alloc_vhd_request(s);
+       if (!req)
+               return -EBUSY;
+
+       req->treq = treq;
+       req->op   = op;
+       req->next = NULL;
+
+       add_to_tail(&bm->waiting, req);
+       lock_bitmap(bm);
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x nr_secs: 0x%04x, "
+           "op: %u\n", s->vhd.file, treq.sec, blk, treq.secs, op);
+
+       TRACE(s);
+       return 0;
+}
+
+static void
+vhd_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       struct vhd_state *s = (struct vhd_state *)driver->data;
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x (seg: %d)\n",
+           s->vhd.file, treq.sec, treq.secs, treq.sidx);
+
+       while (treq.secs) {
+               int err;
+               td_request_t clone;
+
+               err   = 0;
+               clone = treq;
+
+               switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_READ)) {
+               case -EINVAL:
+                       err = -EINVAL;
+                       goto fail;
+
+               case VHD_BM_BAT_CLEAR:
+                       clone.secs = MIN(clone.secs, s->spb - (clone.sec % 
s->spb));
+                       td_forward_request(clone);
+                       break;
+
+               case VHD_BM_BIT_CLEAR:
+                       clone.secs = read_bitmap_cache_span(s, clone.sec, 
clone.secs, 0);
+                       td_forward_request(clone);
+                       break;
+
+               case VHD_BM_BIT_SET:
+                       clone.secs = read_bitmap_cache_span(s, clone.sec, 
clone.secs, 1);
+                       err = schedule_data_read(s, clone, 0);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_NOT_CACHED:

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-changelog] [xen-unstable] blktap2: a completely rewritten blktap implementation, Xen patchbot-unstable <=