WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH 4/4] (Refactored) Add libvdisk, and vdisk_tool

To: xen-devel@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-devel] [PATCH 4/4] (Refactored) Add libvdisk, and vdisk_tool
From: Ben Guthro <bguthro@xxxxxxxxxxxxxxx>
Date: Thu, 21 Jun 2007 13:28:04 -0400
Delivery-date: Thu, 21 Jun 2007 10:25:55 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Thunderbird 2.0.0.0 (X11/20070418)
[PATCH 4/4] (Refactored) Add libvdisk, and vdisk_tool
vdisk-support.patch
provides libvdisk, and vdisk_tool, as described in [PATCH 0/4]
Signed-off-by: Boris Ostrovsky <bostrovsky@xxxxxxxxxxxxxxx>
Signed-off-by: Ben Guthro <bguthro@xxxxxxxxxxxxxxx>

diff -r 75c61490cc06 tools/Makefile
--- a/tools/Makefile    Thu Jun 21 13:05:29 2007 -0400
+++ b/tools/Makefile    Thu Jun 21 13:05:31 2007 -0400
@@ -17,6 +17,7 @@ SUBDIRS-$(VTPM_TOOLS) += vtpm
 SUBDIRS-$(VTPM_TOOLS) += vtpm
 SUBDIRS-y += xenstat
 SUBDIRS-y += libaio
+SUBDIRS-y += vdisk
 SUBDIRS-y += blktap
 SUBDIRS-y += libfsimage
 SUBDIRS-$(XENFB_TOOLS) += xenfb
diff -r 75c61490cc06 tools/vdisk/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/Makefile      Thu Jun 21 13:05:45 2007 -0400
@@ -0,0 +1,65 @@
+#
+# Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+#
+# Portions have been modified by Virtual Iron Software, Inc.
+# (c) 2007. This file and the modifications can be redistributed and/or
+# modified under the terms and conditions of the GNU General Public
+# License, version 2.1 and not any later version of the GPL, as published
+# by the Free Software Foundation.
+#
+XEN_ROOT = ../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Sources of the VHD format library, the core library and the command line tool
+LIBVHD_SRC     = vhd.c vhd_utils.c
+LIBVDISK_SRC   = vdisk_utils.c vdisk_common.c
+TOOL_SRC       = vdisk_tool.c
+
+# Sibling directories added to the include and library search paths below
+LIBAIO_DIR   = ../libaio/src
+BLKTAP_DIR  = ../blktap/drivers
+
+CFLAGS         = -O2 -fno-strict-aliasing -fPIC -Wall -Werror -rdynamic \
+               -D_FILE_OFFSET_BITS=64 \
+               -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -I./ \
+               -I$(LIBAIO_DIR) \
+               -I$(BLKTAP_DIR)
+
+LIB_LDFLAGS    = -dy -shared -L$(LIBAIO_DIR) -laio
+
+INSTALL                = /usr/bin/install
+
+# These targets do not name files; keep a stray file of the same name
+# from masking them.
+.PHONY: all default install clean depend dep
+
+all: default
+default: vdisk_tool libvdisk_vhd.so libvdisk.so
+
+
+%.o: %.c
+       $(CC) $(CFLAGS) -rdynamic  -c $< -o $@
+
+# Use $(CC) like every other rule so a toolchain override is honoured.
+vdisk_tool: $(TOOL_SRC:%.c=%.o) libvdisk_vhd.so libvdisk.so
+       $(CC) $(LOCAL_CFLAGS) -o vdisk_tool -g $(TOOL_SRC) -L./ \
+               -I$(LIBAIO_DIR) \
+               -I$(BLKTAP_DIR) \
+               -L$(LIBAIO_DIR) -L. -lvdisk -ldl -laio
+
+libvdisk_vhd.so: $(LIBVHD_SRC:%.c=%.o) libvdisk.so
+       $(LD) $(LIB_LDFLAGS) -o $@ $^
+
+libvdisk.so: $(LIBVDISK_SRC:%.c=%.o)
+       $(LD) $(LIB_LDFLAGS) -o $@ $^
+
+install: all
+       $(INSTALL) -d $(DESTDIR)/usr/bin
+       $(INSTALL) -d $(DESTDIR)/usr/lib64
+       $(INSTALL) vdisk_tool $(DESTDIR)/usr/bin
+       $(INSTALL) libvdisk_vhd.so libvdisk.so $(DESTDIR)/usr/lib64
+       $(INSTALL) -d $(DESTDIR)/usr/include
+       for header in *.h; do $(INSTALL) $$header $(DESTDIR)/usr/include; done
+
+clean:
+       /bin/rm -f *.o libvdisk_vhd.so vdisk_tool libvdisk.so
+
+# Regenerate the header dependency list picked up by the include below
+depend .depend dep:
+       $(CC) $(CFLAGS) -M $(LIBVDISK_SRC) $(LIBVHD_SRC) $(TOOL_SRC)> .depend
+
+ifeq (.depend,$(wildcard .depend))
+include .depend
+endif
diff -r 75c61490cc06 tools/vdisk/list.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/list.h        Thu Jun 21 13:05:31 2007 -0400
@@ -0,0 +1,168 @@
+// Copy of /usr/include/linux/list.h that does not
+// depend on __KERNEL__ and _LVM_H_INCLUDE
+
+#ifndef _LIST_H
+#define _LIST_H
+
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ *
+ * A list is represented by a head node whose links point back at
+ * itself while the list is empty (see list_empty()).
+ * LIST_HEAD_INIT is the static initializer for such a head,
+ * LIST_HEAD defines and initializes a head variable in one step, and
+ * INIT_LIST_HEAD performs the same initialization at run time.
+ */
+
+struct list_head {
+       struct list_head *next, *prev;  /* following / preceding node */
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+       struct list_head name = LIST_HEAD_INIT(name)
+
+#define INIT_LIST_HEAD(ptr) do { \
+       (ptr)->next = (ptr); (ptr)->prev = (ptr); \
+} while (0)
+
+/*
+ * Link @new in between @prev and @next.
+ *
+ * Internal helper: the caller must already know that @prev and @next
+ * are adjacent entries of the same list.
+ */
+static __inline__ void __list_add(struct list_head * new,
+       struct list_head * prev,
+       struct list_head * next)
+{
+       /* hook up the forward direction ... */
+       new->next = next;
+       next->prev = new;
+       /* ... then the backward direction */
+       new->prev = prev;
+       prev->next = new;
+}
+
+/**
+ * list_add - push an entry onto the front of a list
+ * @new: entry to insert
+ * @head: list head the entry is placed right after
+ *
+ * The new entry becomes the first element, which gives LIFO (stack)
+ * ordering.
+ */
+static __inline__ void list_add(struct list_head *new, struct list_head *head)
+{
+       struct list_head *first = head->next;
+
+       __list_add(new, head, first);
+}
+
+/**
+ * list_add_tail - append an entry to the end of a list
+ * @new: entry to insert
+ * @head: list head the entry is placed right before
+ *
+ * The new entry becomes the last element, which gives FIFO (queue)
+ * ordering.
+ */
+static __inline__ void list_add_tail(struct list_head *new,
+                                    struct list_head *head)
+{
+       struct list_head *last = head->prev;
+
+       __list_add(new, last, head);
+}
+
+/*
+ * Unlink whatever sits between @prev and @next by pointing the two
+ * at each other.
+ *
+ * Internal helper: only for callers that already hold both
+ * neighbours of the entry being removed.
+ */
+static __inline__ void __list_del(struct list_head * prev,
+                                 struct list_head * next)
+{
+       prev->next = next;
+       next->prev = prev;
+}
+
+/**
+ * list_del - remove an entry from its list
+ * @entry: the element to unlink
+ *
+ * Both links of @entry are cleared afterwards, so list_empty() on
+ * @entry does not return true; treat the entry as uninitialized.
+ */
+static __inline__ void list_del(struct list_head *entry)
+{
+       struct list_head *before = entry->prev;
+       struct list_head *after = entry->next;
+
+       __list_del(before, after);
+       entry->prev = 0;
+       entry->next = 0;
+}
+
+/**
+ * list_del_init - remove an entry from its list and make it reusable
+ * @entry: the element to unlink
+ *
+ * After the call @entry points at itself, i.e. it is a valid empty list.
+ */
+static __inline__ void list_del_init(struct list_head *entry)
+{
+       __list_del(entry->prev, entry->next);
+       entry->next = entry;
+       entry->prev = entry;
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test; only read, never modified.
+ *
+ * Returns non-zero when @head links back to itself, zero otherwise.
+ */
+static __inline__ int list_empty(const struct list_head *head)
+{
+       return head->next == head;
+}
+
+/**
+ * list_splice - insert all entries of one list into another
+ * @list: head of the list whose entries are moved
+ * @head: entry after which they are inserted
+ *
+ * Does nothing when @list is empty.  @list itself is not
+ * reinitialized and still points at the moved entries afterwards.
+ */
+static __inline__ void list_splice(struct list_head *list,
+                                  struct list_head *head)
+{
+       struct list_head *first = list->next;
+       struct list_head *last;
+       struct list_head *at;
+
+       if (first == list)
+               return;         /* nothing to move */
+
+       last = list->prev;
+       at = head->next;
+
+       /* front of the moved run attaches to @head ... */
+       first->prev = head;
+       head->next = first;
+
+       /* ... and its back attaches to what used to follow @head */
+       last->next = at;
+       at->prev = last;
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:       the &struct list_head pointer.
+ * @type:      the type of the struct this is embedded in.
+ * @member:    the name of the list_struct within the struct.
+ *
+ * Works by subtracting the byte offset of @member inside @type
+ * (computed from a null @type pointer) from @ptr, so @ptr must
+ * really point at the @member field of a @type object.
+ */
+#define list_entry(ptr, type, member) \
+       ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+/**
+ * list_for_each       -       iterate over a list
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @head:      the head for your list.
+ *
+ * The current entry must not be removed inside the loop body; use
+ * list_for_each_safe() for that.  Arguments are parenthesized so that
+ * any lvalue expression can be passed as @pos.
+ */
+#define list_for_each(pos, head) \
+       for ((pos) = (head)->next; (pos) != (head); \
+               (pos) = (pos)->next)
+               
+/**
+ * list_for_each_safe  -       iterate over a list safe against removal
+ *                             of list entry
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @n:         another &struct list_head to use as temporary storage
+ * @head:      the head for your list.
+ *
+ * The successor is saved in @n before the body runs, so the body may
+ * unlink @pos.  Arguments are parenthesized so that any lvalue
+ * expression can be passed as @pos and @n.
+ */
+#define list_for_each_safe(pos, n, head) \
+       for ((pos) = (head)->next, (n) = (pos)->next; (pos) != (head); \
+               (pos) = (n), (n) = (pos)->next)
+
+
+
+#endif
diff -r 75c61490cc06 tools/vdisk/vdisk.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/vdisk.h       Thu Jun 21 13:05:48 2007 -0400
@@ -0,0 +1,215 @@
+// Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+//
+// Portions have been modified by Virtual Iron Software, Inc.
+// (c) 2007. This file and the modifications can be redistributed and/or
+// modified under the terms and conditions of the GNU General Public
+// License, version 2.1 and not any later version of the GPL, as published
+// by the Free Software Foundation.
+
+#ifndef __VDISK_H
+#define __VDISK_H
+
+#include <sys/types.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <linux/limits.h>
+#include <syslog.h>
+#include <libaio.h>
+#include "list.h"
+#include "tapaio.h"
+
+// vdisk_tool's operations (bit flags)
+#define VDISK_OP_CREATE    (1<<0)
+#define VDISK_OP_HEADERS   (1<<1)
+#define VDISK_OP_DUMP      (1<<2)
+#define VDISK_OP_MODIFY    (1<<3)
+
+// Return codes of the format's map_block() operation
+#define VID_BLOCK_MAPPED    (0)
+#define VID_BLOCK_NOTMAPPED (-1) // reads of such blocks are returned as zeroes by vdisk_rw()
+#define VID_BLOCK_TOOBIG    (-2) // only part of the requested run is mapped; vdisk_rw() retries with a smaller batch
+#define VID_BLOCK_MAPERR    (-3)
+
+// IO operation codes
+#define VDISK_READ  (0)
+#define VDISK_WRITE (1)
+
+// Async IO macros
+#define VDISK_HASH_SZ        (2048)
+#define VDISK_HASH_IDX(x)    ((x) & (VDISK_HASH_SZ-1)) // mask-based index: relies on VDISK_HASH_SZ being a power of two
+#define VDISK_INVALID_HASH   (-1) // key value marking a free hash slot
+#define REQUEST_ASYNC_FD     (1) // Should really be defined in kernel
+
+#define SECTOR_SIZE          (512)
+
+// vdisk device flags
+#define VDISK_SYNCIO_BUF     (1<<0) // vdf_init() allocates a vdisk_syncio_t per file when set
+#define VDISK_RO             (1<<1)
+
+// vdisk file flags
+#define VDF_LEAF    (1<<0) // last COW child (writeable)
+
+// Statistics gathering (compile-time switches)
+#define        VDISK_STATS          (0)
+#define VDISK_SYNCIO_STATS   (0)
+
+#if VDISK_STATS
+#define        DO_STATS(x)     x // statement x is kept
+#else
+#define        DO_STATS(x) // statement x is compiled out
+#endif
+
+
+
+
+// Datatype for addressing host memory: an unsigned integer exactly as
+// wide as a pointer on every target (same underlying type as before on
+// x86 and x86_64).  vdisk_rw() casts buffer pointers to it for
+// alignment checks.
+typedef uintptr_t addr_t;
+
+// File handle type; values are passed to fsync()/posix_fadvise()
+typedef        int file_t;
+
+// Forward declaration
+struct vdisk_dev;
+
+// Stores info about a pending async IO; consumed by vdisk_xfer_cb()
+typedef struct pending_aio {
+       uint32_t block;         // first block whose hash slot vdisk_xfer_cb() frees
+       uint32_t num_blocks;    // number of consecutive blocks covered, starting at 'block'
+       void *arg;              // opaque value handed to the format's xfer_commit()
+       void *aiocb;
+       off_t off;              // file offset; rounded down to a page for posix_fadvise()
+       file_t fd;              // file the IO was issued on (fsync'ed after a write)
+       int op;                 // VDISK_READ or VDISK_WRITE
+       int res;                // passed to xfer_commit() as its 'err' argument
+} pending_aio_t;
+
+// Hash that stores async IO data; slots are indexed by VDISK_HASH_IDX(block)
+typedef struct vdisk_hash {
+       uint64_t key;           // block number that reserved the slot, or VDISK_INVALID_HASH if free
+       struct iocb io;
+       pending_aio_t pio;
+} vdisk_hash_t;
+
+// run data to allow coalescing of writes when doing posix_fadvise() sync/flush
+// NOTE(review): the fields are not used in vdisk_common.c; the meanings
+// noted below are inferred from the names -- confirm in vdisk_utils.c.
+typedef struct vdisk_syncio {
+       int     is_set;         // presumably non-zero while io_start/io_len describe a run
+       off_t   io_start;       // presumably file offset where the current run begins
+       off_t   io_len;         // presumably length of the current run
+#if VDISK_SYNCIO_STATS          /* counters below exist only in stats builds */
+       unsigned long   total_writes;
+       unsigned long   contig_writes;
+       unsigned long   flush_size_sub1MB;
+       unsigned long   flush_size_sub2MB;
+       unsigned long   flush_size_sub4MB;
+       unsigned long   flush_size_sub8MB;
+       unsigned long   flush_size_ovr8MB;
+       unsigned long   flush_size_force;
+       time_t          last_dbg_print;
+#endif
+} vdisk_syncio_t;
+
+// Per-file structure; one per file on vdisk_dev_t.vdf_head
+typedef struct vd_file {
+       struct list_head vdf_list; // links this file into vdisk_dev_t.vdf_head
+       char name[PATH_MAX];
+       file_t fd;
+       int flags;              // VDF_* file flags
+       int batch_sz;           // number of blocks that are mapped sequentially
+       void *vdf;              // format-specific data
+       vdisk_syncio_t *syncio; // allows sync io to buffer in pagecache for 
+                               //  better io performance; allocated by
+                               //  vdf_init() when VDISK_SYNCIO_BUF is set,
+                               //  freed by vdisk_fini()
+} vd_file_t;
+
+// Data describing format's properties (ops etc.)
+typedef struct vdf_data {
+       char ftype[8];                    // File name extension
+
+       int (*open)(struct vdisk_dev *vdisk, char *filename); // called by vdf_read_state(); non-zero means failure
+       void (*close)(struct vdisk_dev *vdisk); // called by vdisk_fini()
+       int (*map_block)(vd_file_t *vf, uint32_t *blockno, int num_blocks, 
+                        int op, void **arg); // returns a VID_BLOCK_* code; *blockno is IN/OUT
+       int (*xfer_commit)(void *arg, int err); // called once a transfer is done; non-zero means failure
+       int (*print_header)(vd_file_t *vf); // used by vdf_print_headers()
+       int (*parse_args)(int argc, int operations, char *argv[], void **optp);
+       int (*create_vdisk)(char *filename, void *optp);
+       int (*modify_vdisk)(struct vdisk_dev *vdisk, void *optp);
+       struct list_head vdfd_list; // connects to global format list
+} vdf_data_t;
+
+// Top-level datastructure
+typedef struct vdisk_dev {
+
+       struct vdisk_geom {
+               int cyls;
+               int heads;
+               int secs;
+       } geom;
+
+        ssize_t sz;      // Device size (bytes); vdisk_rw() rejects requests that reach past it
+       
+       int flags;       // VDISK_* device flags, stored by vdisk_init()
+
+       // head of vdisk files (vd_file_t) list
+       struct list_head vdf_head;
+
+       vdf_data_t *vdfd; // operations of the file format in use
+
+       // AIO data
+       vdisk_hash_t hash[VDISK_HASH_SZ]; // per-block reservations, indexed by VDISK_HASH_IDX(block)
+       struct iocb *aio_submit[VDISK_HASH_SZ];
+       struct io_event aio_events[VDISK_HASH_SZ];
+       tap_aio_context_t   aio_ctx;
+       int use_aio;     // cleared by vdf_read_state() when io_queue_init() fails
+       int aio_fd;
+       int aio_cnt;
+
+       // Stats (maintained by vdisk_rw())
+       uint64_t busyio;  // requests that found a needed hash slot occupied
+       uint64_t syncio;  // requests that fell back from async to sync IO
+       uint64_t asyncio; // batches handled on the async path
+       uint64_t tot_io;  // requests that passed the size check
+} vdisk_dev_t;
+
+struct program_props {   // optional settings accepted by vdisk_init()
+       void *alloc_func; // forwarded to vdisk_alloc_init()
+       void *free_func;  // forwarded to vdisk_alloc_init()
+       int out_target;   // copied to vdisk_out_target (VDISK_OUT_*)
+};
+
+
+#define VDISK_OUT_STDERR (0)
+#define VDISK_OUT_SYSLOG (1)
+extern int vdisk_dbg_level;
+extern int vdisk_out_target;
+// Log a message at debug level n, tagged with the calling file and line
+#define VIDDBG(n, fmt, args...) \
+       vdisk_log_error(n, __FILE__, __LINE__, fmt, ##args)
+
+// Log the failed expression and abort() when expr is false.
+// Expands to one void expression without a trailing semicolon, so
+// "if (x) ASSERT(y); else ..." parses as written.
+#define ASSERT(expr)                                                    \
+       ((expr) ? (void)0 :                                             \
+        ({                                                             \
+                VIDDBG(0, "Assertion failed: %s\n", __STRING(expr));   \
+                abort();                                               \
+        }))
+
+extern int vdisk_pagesz; //4K -- NOTE(review): defined as 0 in vdisk_common.c; presumably set during init, confirm
+
+extern void vdisk_log_error(int level, char *file, int line, char *fmt, ...); // backend of VIDDBG()
+extern int vdf_read_state(vdisk_dev_t *vdisk, char *filename); // 0 on success, else the format open() error
+extern int vdf_print_headers(vdisk_dev_t *vdisk, char *filename);
+extern int vdisk_register (vdf_data_t *vdfd); // 0 on success, -1 if vdfd is already registered
+extern void vdisk_unregister (vdf_data_t *vdfd);
+extern int vdf_init(vdisk_dev_t *vdisk, char *fname); // format is selected by fname's extension
+extern int vdisk_common_init(vdisk_dev_t *vdisk);
+extern int vdf_find_vdfd(vdisk_dev_t *vdisk, char *ftype);
+extern int vdisk_xfer_cb(vdisk_dev_t *vdisk, struct pending_aio *pio); // async IO completion handling
+extern int vdisk_rw(void *hdl, int64_t sector_num, 
+                   uint8_t *buf, int nb_sectors, int write, void *aiocb); // the definition names these block, nb_blocks, op
+extern void vdisk_alloc_init(void *alloc_func, void *free_func);
+extern int vdisk_init(vdisk_dev_t *vdisk, char *filename,
+                     struct program_props *props, uint8_t flags); // 0 on success, else the vdf_init() error
+extern void vdisk_fini(vdisk_dev_t *vdisk); // no-op if vdisk, its vdfd or vdfd->close is NULL
+
+#endif /* __VDISK_H */
diff -r 75c61490cc06 tools/vdisk/vdisk_common.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/vdisk_common.c        Thu Jun 21 13:05:53 2007 -0400
@@ -0,0 +1,616 @@
+// Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+//
+// Portions have been modified by Virtual Iron Software, Inc.
+// (c) 2007. This file and the modifications can be redistributed and/or
+// modified under the terms and conditions of the GNU General Public
+// License, version 2.1 and not any later version of the GPL, as published
+// by the Free Software Foundation.
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dlfcn.h>
+
+#include "vdisk.h"
+#include "vdisk_utils.h"
+
+
+static int vdisk_initialized = 0;
+int vdisk_pagesz = 0;
+
+/*
+ * Tear down a vdisk: drop the per-file sync-IO buffers, then let the
+ * format close the device.  Does nothing if vdisk, its format data or
+ * the format's close operation is missing (e.g. already closed).
+ */
+void vdisk_fini(vdisk_dev_t *vdisk)
+{
+       struct list_head *node;
+
+       if (!vdisk || !vdisk->vdfd || !vdisk->vdfd->close)
+               return;
+
+       for (node = vdisk->vdf_head.next; node != &vdisk->vdf_head;
+            node = node->next) {
+               vd_file_t *file = list_entry(node, vd_file_t, vdf_list);
+
+               free(file->syncio);
+               file->syncio = NULL;
+       }
+
+       vdisk->vdfd->close(vdisk);
+}
+
+/*
+ * Initialize a vdisk from filename.
+ *
+ * props may be NULL, in which case output goes to stderr and
+ * vdisk_alloc_init() is called with NULL functions.  Anything up to
+ * and including the first ':' in filename is skipped before the name
+ * is handed to vdf_init().
+ *
+ * Returns 0 on success or the error reported by vdf_init().
+ */
+int vdisk_init(vdisk_dev_t *vdisk, char *filename,
+              struct program_props *props, uint8_t flags)
+{
+       void *alloc_fn = NULL;
+       void *free_fn = NULL;
+       char *colon;
+       char *fname;
+       int err;
+
+       vdisk_common_init(NULL/*XXX: ?? */);
+
+       // Set where output is directed
+       vdisk_out_target = props ? props->out_target : VDISK_OUT_STDERR;
+       if (props) {
+               alloc_fn = props->alloc_func;
+               free_fn = props->free_func;
+       }
+       vdisk_alloc_init(alloc_fn, free_fn);
+
+       colon = strchr(filename, ':');
+       fname = colon ? colon + 1 : filename;
+
+       vdisk->flags = flags;
+
+       err = vdf_init(vdisk, fname);
+       if (err == 0)
+               return (0);
+
+       VIDDBG(0, "Can't initialize format's data for %s\n",
+               filename);
+       return (err);
+}
+
+/*
+ * Select the file format from fname's extension, read the vdisk state
+ * and, when VDISK_SYNCIO_BUF is set, allocate sync-IO run data for
+ * every file on the list.
+ *
+ * Returns 0 on success, EINVAL if fname has no extension, the
+ * vdf_find_vdfd() error if the format is unknown, or -1 if the state
+ * cannot be read.  A failed run-data allocation is logged only.
+ */
+int
+vdf_init(vdisk_dev_t *vdisk, char *fname) 
+{
+       struct list_head *node;
+       char *dot;
+       int err;
+
+       dot = strrchr(fname, '.');
+       if (!dot) {
+               VIDDBG(0, "Can't determine file type for %s\n", fname);
+               return (EINVAL);
+       }
+
+       // The extension starts right after the '.'
+       err = vdf_find_vdfd(vdisk, dot + 1);
+       if (err) {
+               VIDDBG(0, "Can't find format's data\n");
+               return (err);
+       }
+
+       if (vdf_read_state(vdisk, fname) != 0) {
+               VIDDBG(0, "failed to read headers\n");
+               return (-1);
+       }
+
+       if (!(vdisk->flags & VDISK_SYNCIO_BUF))
+               return (0);
+
+       for (node = vdisk->vdf_head.next; node != &vdisk->vdf_head;
+            node = node->next) {
+               vd_file_t *file = list_entry(node, vd_file_t, vdf_list);
+
+               file->syncio = calloc(1, sizeof(*file->syncio));
+               if (file->syncio == NULL)
+                       VIDDBG(0, "vdisk_alloc_syncio_run_data() "
+                              "failed '%s', thus no speed up\n",
+                              strerror(errno));
+       }
+
+       return (0);
+}
+
+/*
+ * Map a single block by asking each file on the device's list in turn.
+ *
+ * blockno is IN/OUT and handed to the format's map_block().  *vf is
+ * set to the file tried last.  Returns VID_BLOCK_MAPPED as soon as one
+ * file maps the block, otherwise the result of the last attempt
+ * (VID_BLOCK_NOTMAPPED when the list is empty); a failed write
+ * mapping is logged.
+ */
+int
+vdisk_map_block(struct vdisk_dev *dev, 
+               uint32_t *blockno,      /* IN/OUT */
+               int op,
+               vd_file_t **vf,
+               void **arg)
+{
+       struct list_head *node;
+       int res = VID_BLOCK_NOTMAPPED;
+
+       for (node = dev->vdf_head.next; node != &dev->vdf_head;
+            node = node->next) {
+               vd_file_t *file = list_entry(node, vd_file_t, vdf_list);
+
+               *vf = file;
+               res = dev->vdfd->map_block(file, blockno, 1, op, arg);
+               if (res == VID_BLOCK_MAPPED)
+                       return (res);
+       }
+
+       if (op == VDISK_WRITE)
+               VIDDBG(0, "Couldn't map block %d\n", *blockno);
+
+       return (res);
+}
+
+/*
+ * Reset the vdisk's file list, set up async IO if requested, and have
+ * the format open filename.
+ *
+ * If io_queue_init() fails, async IO is switched off and the open
+ * still proceeds.  Returns 0 on success or the format's open() error.
+ */
+int
+vdf_read_state(vdisk_dev_t *vdisk, char *filename)
+{
+       int err;
+       int slot;
+
+       INIT_LIST_HEAD(&vdisk->vdf_head);
+
+       if (vdisk->use_aio) {
+               // Mark every hash slot as free
+               slot = 0;
+               while (slot < VDISK_HASH_SZ) {
+                       vdisk->hash[slot].key = VDISK_INVALID_HASH;
+                       slot++;
+               }
+
+               memset(&vdisk->aio_ctx.aio_ctx, 0, sizeof(io_context_t));
+               err = io_queue_init(100, &vdisk->aio_ctx.aio_ctx);
+               if (err != 0) {
+                       VIDDBG(0, "io_queue_init() failed: %s. "
+                              " Async IO will not be available\n", 
+                              strerror(-1*err));
+                       vdisk->use_aio = 0;
+               }
+       }
+
+       err = vdisk->vdfd->open(vdisk, filename);
+       if (err == 0)
+               return (0);
+
+       VIDDBG(0, "Problems opening vdisk %s (error %d)\n", 
+              filename, err);
+       return (err);
+}
+
+/*
+ * Read the vdisk's state from filename and print the header of the
+ * first file on the resulting file list.
+ *
+ * Returns 0 on success, the vdf_read_state() error if the state
+ * cannot be read, or EINVAL if reading succeeded but left the file
+ * list empty.  The result of print_header() itself is ignored.
+ */
+int
+vdf_print_headers(vdisk_dev_t *vdisk, char *filename)
+{
+       int err;
+       vd_file_t *vf;
+
+       err = vdf_read_state(vdisk, filename);
+       if (err) {
+               VIDDBG(0, "Failed to read state for %s\n", filename);
+               return (err);
+       }
+
+       // On an empty list, list_entry() would turn the list head itself
+       // into a bogus vd_file_t pointer.
+       if (list_empty(&vdisk->vdf_head)) {
+               VIDDBG(0, "No files found for %s\n", filename);
+               return (EINVAL);
+       }
+
+       vf = list_entry(vdisk->vdf_head.next, vd_file_t, vdf_list);
+       (void)vdisk->vdfd->print_header(vf);
+
+       return (0);
+}
+
+/*
+ * Completion handling for the async IO described by pio: commit the
+ * transfer with the format, fsync() the file after a write, drop the
+ * affected pages from the page cache, and free the hash slots of the
+ * blocks the IO covered.
+ *
+ * Returns the posix_fadvise() result (0 or an error number).
+ * Failures of xfer_commit() and fsync() are logged but not returned.
+ */
+int
+vdisk_xfer_cb(vdisk_dev_t *vdisk, struct pending_aio *pio)
+{
+       uint32_t blk;
+       int err = 0;
+
+       ASSERT(pio != NULL);
+
+       err = vdisk->vdfd->xfer_commit(pio->arg, pio->res);
+       if (err)
+               VIDDBG(0, "Failed to commit transfer (error %d)\n", err);
+       
+       if (pio->op == VDISK_WRITE) {
+               err = fsync(pio->fd);
+               if (err)
+                       VIDDBG(0, "fsync: %s\n", strerror(errno));
+       }
+       
+       /*
+        * posix_fadvise() (or, rather, kernel's sys_fadvise64_64())
+        * invalidates whole pages only.
+        */
+       err = posix_fadvise(pio->fd, (pio->off & (~((off_t)vdisk_pagesz-1))),
+                           (ssize_t)(pio->num_blocks<<9) + (off_t)vdisk_pagesz,
+                           POSIX_FADV_DONTNEED);
+       // posix_fadvise() hands back the error number itself and leaves
+       // errno alone, so report the returned value.
+       if (err)
+               VIDDBG(0, "posix_fadvise: %s\n", strerror(err));
+       
+
+       for (blk=pio->block; blk < (pio->block + pio->num_blocks); blk++)
+               vdisk->hash[VDISK_HASH_IDX(blk)].key = VDISK_INVALID_HASH;
+
+       return (err);
+}
+
+
+// Give back the hash slots that vdisk_rw() reserved for blocks
+// [first, first + count) and that will not be handed to an async IO.
+static void
+vdisk_rw_release_hash(struct vdisk_dev *vdisk, uint32_t first, int count)
+{
+       uint32_t blk = first;
+       int n;
+
+       for (n = 0; n < count; n++, blk++) {
+               int idx = VDISK_HASH_IDX(blk);
+
+               ASSERT(vdisk->hash[idx].key == blk);
+               vdisk->hash[idx].key = VDISK_INVALID_HASH;
+       }
+}
+
+/*
+ * Read or write nb_blocks 512-byte blocks starting at 'block'.
+ *
+ * hdl       - the vdisk_dev_t to operate on
+ * block     - first block of the request
+ * buf       - caller's buffer; when it is not 512-byte aligned the data
+ *             is bounced through an aligned temporary buffer and the
+ *             request is served synchronously
+ * nb_blocks - number of blocks to transfer
+ * op        - VDISK_READ or VDISK_WRITE
+ * aiocb     - passed through to vdisk_asyncio()
+ *
+ * When async IO is enabled, a hash slot is reserved for every block of
+ * the request first.  If a needed slot is held by a different block
+ * the request fails with -EBUSY; if it is held by the same block the
+ * request falls back to synchronous IO.  Slots of blocks submitted as
+ * async IO are freed by vdisk_xfer_cb(); all other reserved slots are
+ * freed here.
+ *
+ * Returns the number of bytes processed when no async IO was used, the
+ * number of zero-filled bytes when the caller asked for async IO and
+ * some reads hit unallocated blocks, or 0 when the caller has to wait
+ * for async completions.  Errors: -ENOSPC (request reaches past the
+ * device), -EBUSY, -ENOMEM, and the two legacy positive values flagged
+ * in the body.
+ */
+int vdisk_rw(void *hdl, int64_t block, 
+            uint8_t *buf, int nb_blocks,
+            int op, void *aiocb)
+{
+        off_t offset;
+       unsigned long bytes;
+       uint32_t real_block, blk;
+       vd_file_t *vdf = NULL;
+       void *arg = NULL;
+       struct vdisk_dev *vdisk = (struct vdisk_dev *)hdl;
+       int i;
+       struct list_head *ptr;
+       int res = 0;
+       char *b = (char *)buf;
+       char *pool = NULL;
+       int batch;
+       int use_aio = vdisk->use_aio;
+       int busy = 0;
+       int hash_index;
+       int zero_blocks = 0;
+
+       VIDDBG(50, "block=0x%" PRIx64 ", nb_blocks=%d\n", 
+              block, nb_blocks);
+
+       if (((block + (nb_blocks-1)) << 9) >= vdisk->sz) {
+               return (-ENOSPC);
+       }
+
+       vdisk->tot_io++;
+
+       if (use_aio) {
+               // Check whether the hash has available slots and reserve them
+               // We reserve them as we go because we want to make sure that
+               // the request fits in the hash.
+               for (i=0, blk=block; i<nb_blocks; i++, blk++) {
+                       hash_index = VDISK_HASH_IDX(blk);
+                       VIDDBG(50, "block=0x%" PRIx64 ", nb_blocks=%d i=%d "
+                              "blk=0x%x, vdisk->hash.key[%d]=0x%" PRIx64 "\n", 
+                              block, nb_blocks, i,
+                              blk, hash_index, 
+                              vdisk->hash[hash_index].key);
+                       if (vdisk->hash[hash_index].key != VDISK_INVALID_HASH) {
+                               vdisk->busyio++;
+                               if (vdisk->hash[hash_index].key != blk)
+                                       busy = 1;
+                               use_aio = 0;
+                               break;
+                       }
+                       vdisk->hash[hash_index].key = blk;
+                       VIDDBG(50, "hash_index=%d, blk=%d\n", 
+                              hash_index, blk);
+               }
+
+               // We need to free hash entries that we've just reserved.
+               if (!use_aio) {
+                       VIDDBG(50, "Freeing hash for block %" PRId64 "\n",
+                              block);
+                       // Exactly 'i' slots were reserved before the
+                       // collision.  Counting upwards avoids the unsigned
+                       // wrap-around a downward loop hits when block is 0.
+                       vdisk_rw_release_hash(vdisk, (uint32_t)block, i);
+                       VIDDBG(50, "Done\n");
+                       if (busy) {
+                               VIDDBG(50, "Busy\n");
+                               return (-EBUSY);
+                       }
+                       vdisk->syncio++;
+               }
+       }
+
+       // We can only transfer to/from an aligned buffer
+       if ((addr_t)buf & 511) {
+               b = pool = vdisk_malloc((nb_blocks+1) * 512);
+
+               // The bounce buffer forces synchronous IO, so no completion
+               // callback will ever free the slots reserved above.
+               if (use_aio) {
+                       vdisk_rw_release_hash(vdisk, (uint32_t)block,
+                                             nb_blocks);
+                       use_aio = 0;
+               }
+
+               if (pool == NULL) {
+                       VIDDBG(0, "Can't create buffer\n");
+                       return (-ENOMEM);
+               }
+               while ((addr_t)b & 511) b++;
+               VIDDBG(10, "Aligned buffer %p (pool %p, b %p)\n", buf, pool, b);
+       }
+
+       i = 0; // block in the buf[]
+       while (nb_blocks>0) {
+
+               // Find largest contiguous set of blocks that
+               // we can access in a single IO.
+
+               batch = nb_blocks;
+       again:
+               arg = NULL;
+               list_for_each(ptr, &vdisk->vdf_head) {
+
+                       vdf = list_entry(ptr, vd_file_t, vdf_list);
+
+                       real_block = (uint32_t)block;
+
+                       // Make batch fit into a single vdf->batch_sz
+                       if ( ((block + batch - 1) & ~(vdf->batch_sz-1))
+                            != (block & ~(vdf->batch_sz-1)))
+                               batch = ( (block + vdf->batch_sz) & 
+                                         ~(vdf->batch_sz-1) )
+                                       - block;
+
+                       // Map the requested block set to address in the file
+                       res = vdisk->vdfd->map_block(vdf, &real_block, 
+                                                    batch, op, &arg);
+
+                       if (res == VID_BLOCK_TOOBIG) {
+                               // Some blocks are mapped and some are not.
+                               // Need to try a smaller batch
+
+                               batch >>= 1;
+                               if (!batch) {
+                                       // Slots are only ours to free (and
+                                       // to assert on) while AIO is in use.
+                                       if (use_aio)
+                                               vdisk_rw_release_hash(vdisk,
+                                                       (uint32_t)block,
+                                                       nb_blocks);
+                                       if (pool)
+                                               vdisk_free(pool);
+
+                                       VIDDBG(0, "Inconsistent mapping error\n");
+                                       // NOTE(review): positive, unlike the
+                                       // -errno returns above; kept for
+                                       // existing callers.
+                                       return EINVAL;
+                               }
+                               goto again;
+                       }
+
+                       if ((res != VID_BLOCK_NOTMAPPED) ||
+                           ((vdf->flags & VDF_LEAF) && (op == VDISK_WRITE)))
+                               break;
+               }
+
+               if (res != VID_BLOCK_MAPPED) {
+                       
+                       // Unallocated blocks return zeroes for reads
+                       if ((op == VDISK_READ) && (res == VID_BLOCK_NOTMAPPED)) {
+                               
+                               // No IO is issued for this batch: free its slots
+                               if (use_aio)
+                                       vdisk_rw_release_hash(vdisk,
+                                                             (uint32_t)block,
+                                                             batch);
+
+                               memset(&buf[i*512], 0, batch*512);
+                               i += batch;
+                               b += batch * 512;
+                               block += batch;
+                               nb_blocks -= batch;
+                               zero_blocks += batch;
+                               VIDDBG(10, "Skipping %d blocks\n", batch);
+                               continue;
+                       }
+
+                       VIDDBG(0, "Couldn't map block %" PRId64 " (%d)\n", 
+                              block, res);
+                       // The remaining blocks will never be submitted
+                       if (use_aio)
+                               vdisk_rw_release_hash(vdisk, (uint32_t)block,
+                                                     nb_blocks);
+                       if (pool)
+                               vdisk_free(pool);
+                       // NOTE(review): res is a negative VID_BLOCK_* code,
+                       // so this returns a small positive number; kept for
+                       // existing callers.
+                       return (-1*res);
+               }
+
+               VIDDBG(50, "mapped sector %" PRId64 " to block %d for read\n", 
+                      block, real_block);
+
+               // Offset in the file
+               offset = (uint64_t)real_block << 9;
+
+               if (use_aio)
+                       vdisk->asyncio++;
+
+               // Perform IO
+               if (op == VDISK_WRITE) {
+                       if (pool)
+                               memcpy(b, &buf[i*512], batch * 512); 
+                       if (!use_aio)
+                               bytes = vdisk_syncio(vdf->fd, b, batch * 512, 
+                                                    offset, VDISK_WRITE, 
+                                                    vdf->syncio);
+                       else
+                               bytes = vdisk_asyncio(vdisk, block, vdf->fd, 
+                                                     b, batch * 512, offset, 
+                                                     arg, aiocb, VDISK_WRITE);
+               } else /* VDISK_READ */ {
+                       if (!use_aio) {
+                               bytes = vdisk_syncio(vdf->fd, b, batch * 512, 
+                                                    offset, VDISK_READ, NULL);
+                               if (pool)
+                                       memcpy(&buf[i*512], b, batch * 512);
+                       } else {
+                               bytes = vdisk_asyncio(vdisk, block, vdf->fd,
+                                                     b, batch * 512, offset, 
+                                                     arg, aiocb, VDISK_READ);
+                       }
+               }
+
+               if (bytes != batch * 512) {
+                       VIDDBG(0, "%s %ld bytes (block %d) instead of "
+                              "%d (%s)\n", (op==VDISK_WRITE)?"Wrote":"Read", 
+                              bytes, real_block, batch * 512, vdf->name);
+                       if ((signed long)bytes == -1)
+                               res = errno;
+               }
+       
+               // Sync transfers are committed here; async ones are
+               // committed from vdisk_xfer_cb().
+               if (!use_aio)
+                       if (vdisk->vdfd->xfer_commit(arg, res))
+                               VIDDBG(0, "Couldn't commit transfer\n");
+               
+               i += batch;
+               b += batch * 512;
+               block += batch;
+               nb_blocks -= batch;
+       }
+
+       if (pool)
+               vdisk_free(pool);
+
+       /*
+        * Returning number of processed bytes to caller who requested AIO 
+        * (vdisk->use_aio && aiocb) will tell him that there is no 
+        * need to wait for AIO completion
+        * There are two cases when this happens:
+        *  - We couldn't perform any AIOs (use_aio == 0)
+        *  - Some requests have been reads to unallocated blocks (and 
+        *    thus are read as zeroes). Note that if *some* blocks have been
+        *    sent as AIOs, the caller will need to wait for completions 
+        *    (and we return zero).
+        */
+       if (!use_aio)
+               return (i * 512); // 'i' is number of accessed sectors;
+       else if (vdisk->use_aio && aiocb && (zero_blocks != 0))
+               return (zero_blocks * 512); 
+       else
+               return (0);
+}
+
+// Head of the list of registered format descriptors
+LIST_HEAD(vdfd_head);
+
+// Add a format descriptor to the list of registered formats.
+// Returns 0 on success, -1 if this very descriptor (compared by
+// address) is already on the list.
+int
+vdisk_register(vdf_data_t *new_vdfd)
+{
+       struct list_head *pos;
+
+       // Refuse to link the same descriptor twice
+       list_for_each(pos, &vdfd_head) {
+               if (list_entry(pos, vdf_data_t, vdfd_list) == new_vdfd)
+                       return (-1);
+       }
+
+       list_add(&new_vdfd->vdfd_list, &vdfd_head);
+       VIDDBG(10, "Registered \"%s\" format\n", new_vdfd->ftype);
+       return (0);
+}
+
+// Remove a format descriptor from the list of registered formats.
+// A descriptor that is not on the list is left untouched.
+void
+vdisk_unregister(vdf_data_t *vdfd)
+{
+       struct list_head *pos;
+
+       list_for_each(pos, &vdfd_head) {
+               vdf_data_t *entry = list_entry(pos, vdf_data_t, vdfd_list);
+
+               if (entry != vdfd)
+                       continue;
+
+               // Found it: unlink and stop walking the list
+               list_del(&vdfd->vdfd_list);
+               break;
+       }
+}
+
+// Find format-specific library, load it and call its init routine.
+//
+// The library is looked up as "libvdisk_<name>.so" and has to export a
+// function called "<name>_init", which is called without arguments.
+//
+// Returns 0 on success, -1 if 'name' is NULL or too long, if the library
+// can't be loaded or if the init routine can't be found.
+int
+vdisk_init_format(char *name)
+{
+       static const char lib_prefix[] = "libvdisk_";
+       static const char lib_suffix[] = ".so";
+       static const char init_suffix[] = "_init";
+       void *handle;
+       char libname[64];
+       char initfunc[32];
+       void (*init)(void);
+       char *err;
+       size_t len;
+
+       if (name == NULL) {
+               VIDDBG(0, "No format name given\n");
+               return (-1);
+       }
+
+       // Both names are built in fixed-size buffers: reject anything that
+       // wouldn't fit (terminating NUL included) instead of overflowing
+       len = strlen(name);
+       if (len + sizeof(lib_prefix) + sizeof(lib_suffix) - 1 >
+           sizeof(libname) ||
+           len + sizeof(init_suffix) > sizeof(initfunc)) {
+               VIDDBG(0, "Format name %s is too long\n", name);
+               return (-1);
+       }
+
+       // Construct library name
+       (void)strcpy(libname, lib_prefix);
+       (void)strcat(libname, name);
+       (void)strcat(libname, lib_suffix);
+
+       handle = dlopen (libname, RTLD_LAZY);
+       if (!handle) {
+               VIDDBG(0, "%s\n", dlerror());
+               return (-1);
+       }
+
+       dlerror();    // Clear any existing error
+
+       // Construct init function name
+       (void)strcpy(initfunc, name);
+       (void)strcat(initfunc, init_suffix);
+
+       *(void **) (&init) = dlsym(handle, initfunc);
+       if ((err = dlerror()) != NULL)  {
+               VIDDBG(0, "%s\n", err);
+               (void)dlclose(handle);  // don't leak the library handle
+               return (-1);
+       }
+
+       // dlsym() may succeed and still hand back a NULL symbol value
+       if (init == NULL) {
+               VIDDBG(0, "%s is NULL in %s\n", initfunc, libname);
+               (void)dlclose(handle);
+               return (-1);
+       }
+
+       // Call format-specific init routine. The handle is deliberately
+       // not closed on this path, so the library stays loaded.
+       (*init)();
+
+       return (0);
+}
+
+// Look up the registered format descriptor whose ftype equals 'ftype'
+// and store it in vdisk->vdfd. If none matches, the format is initialized
+// with vdisk_init_format() and the lookup is repeated once.
+//
+// Returns 0 on success, EINVAL if the format is still unknown after
+// initialization, or the error returned by vdisk_init_format().
+int
+vdf_find_vdfd(vdisk_dev_t *vdisk, char *ftype)
+{
+       struct list_head *pos;
+       int pass;
+       int rc;
+
+       for (pass = 0; pass < 2; pass++) {
+               list_for_each(pos, &vdfd_head) {
+                       vdf_data_t *cand;
+
+                       cand = list_entry(pos, vdf_data_t, vdfd_list);
+                       if (strcmp(cand->ftype, ftype) == 0) {
+                               vdisk->vdfd = cand;
+                               return (0);
+                       }
+               }
+
+               // Second lookup came up empty as well: give up
+               if (pass != 0) {
+                       VIDDBG(0, "Unknown format %s\n", ftype);
+                       return (EINVAL);
+               }
+
+               // Nothing registered for this format yet, maybe it
+               // just needs to be initialized before we look again
+               rc = vdisk_init_format(ftype);
+               if (rc != 0) {
+                       VIDDBG(0, "Can't initialize format %s\n", ftype);
+                       return (rc);
+               }
+       }
+
+       /*NOTREACHED*/
+       return (EINVAL);
+}
+
+// One-time library setup: initializes the format list head and caches
+// the system page size in vdisk_pagesz. Calls after the first one do
+// nothing. The 'vdisk' argument is not used. Always returns 0.
+int
+vdisk_common_init(vdisk_dev_t *vdisk)
+{
+       if (!vdisk_initialized) {
+               INIT_LIST_HEAD(&vdfd_head);
+               vdisk_pagesz = getpagesize();
+               vdisk_initialized = 1;
+       }
+
+       return (0);
+}
diff -r 75c61490cc06 tools/vdisk/vdisk_tool.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/vdisk_tool.c  Thu Jun 21 13:05:31 2007 -0400
@@ -0,0 +1,338 @@
+// Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+//
+// Portions have been modified by Virtual Iron Software, Inc.
+// (c) 2007. This file and the modifications can be redistributed and/or
+// modified under the terms and conditions of the GNU General Public
+// License, version 2.1 and not any later version of the GPL, as published
+// by the Free Software Foundation.
+
+#define _GNU_SOURCE  // for strndup()
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdint.h>
+#include <getopt.h>
+
+#include "vdisk.h"
+
+extern int vdisk_init_format(char *);
+static char *supported_formats[] = {"vhd", NULL};
+
+// Initialize the vdisk library and then every format listed in
+// supported_formats[]. Returns 0 on success or the first error seen.
+int
+init_tool()
+{
+       char **fmt;
+       int rc;
+
+       rc = vdisk_common_init(NULL/*XXX: ?? */);
+       if (rc != 0) {
+               VIDDBG(0, "Failed to initialize vdisk\n");
+               return (rc);
+       }
+
+       // The table is NULL-terminated
+       for (fmt = supported_formats; *fmt != NULL; fmt++) {
+               rc = vdisk_init_format(*fmt);
+               if (rc != 0) {
+                       VIDDBG(0, "Failed to initialize %s format\n",
+                               *fmt);
+                       return (rc);
+               }
+       }
+
+       return (0);
+}
+
+// Print command-line help, followed by the list of supported formats,
+// to stderr. 'prog' is the program name shown in the usage line.
+static void
+print_usage(char *prog)
+{
+       int i;
+
+       fprintf(stderr, "Usage: %s OPTIONS -# <format-specific options> "
+               "<filename>\n", prog);
+       fprintf(stderr,
+               " OPTIONS:\n"
+               "          [-f <format>] [-C] [-H] [-M] "
+               "[-D <block> [-b <num_blocks>] [-o outfile]]\n"
+               "      -C              Create a vdisk\n"
+               "      -H              Read vdisk headers from file\n"
+               "      -M              Modify a vdisk\n"
+               "      -D              Dump a vhd\n"
+               "        block           first block to read (required)\n"
+               "        num_blocks      number of blocks to read. If not\n"
+               "                          specified, whole file will be read\n"
+               "        outfile         output file. If not specified,\n"
+               "                          stdout is used\n"
+               " Supported formats: ");
+
+       // supported_formats[] is NULL-terminated
+       for (i = 0; supported_formats[i] != NULL; i++)
+               fprintf(stderr, "%s ", supported_formats[i]);
+       fprintf(stderr, "\n");
+}
+
+// vdisk_tool entry point.
+//
+// The last command-line argument is taken as the vdisk file name. Generic
+// options are parsed here; once '-#' is seen, the format's parse_args()
+// hook is called with argc/argv to pick up format-specific options.
+// The requested operations are then run in this order: create, print
+// headers, modify, dump.
+//
+// Returns 0 on success and an error code otherwise; some failures
+// terminate the program through exit(1).
+int
+main(int argc, char *argv[])
+{
+       char filename[PATH_MAX];
+       char *outfile = NULL;
+       char format[16] = "vhd";
+       int operations = 0;
+       // getopt() returns an int; keeping it in a char breaks the
+       // comparison with -1 where char is unsigned
+       int c = 0;
+       extern char *optarg;
+       extern int optind, opterr, optopt;
+       vdisk_dev_t vdisk;
+       int err;
+       void *optp = NULL; // Format-specific options
+       char *file_fmt;
+       int i;
+       int first_block = 0, num_blocks = -1;
+       struct program_props props;
+       uint8_t flags;
+       size_t len;
+
+       //init_tool();
+
+       // Without arguments argv[argc-1] is not a vdisk file name
+       if (argc < 2) {
+               print_usage(argc > 0 ? argv[0] : "vdisk_tool");
+               return (EINVAL);
+       }
+
+       /*
+        * Read the filename argument first -- we may need
+        * it to determine format
+        */
+       if (strlen(argv[argc-1]) >= sizeof(filename)) {
+               VIDDBG(0, "File name is too long\n");
+               return (ENAMETOOLONG);
+       }
+       strcpy(filename, argv[argc-1]);
+       file_fmt = strrchr(filename, '.');
+
+       // See whether what we think is file's format is supported
+       if (file_fmt) {
+               file_fmt++; // Skip '.'
+               for (i=0; ;i++) {
+                       if (supported_formats[i] == NULL) {
+                               // Not a supported format, ignore suffix
+                               file_fmt = NULL;
+                               break;
+                       }
+
+                       if (!strcmp(file_fmt, supported_formats[i]))
+                               break;  // Found it
+               }
+       }
+
+       vdisk.vdfd = NULL;
+
+       while (c != '#') {
+
+               c = getopt(argc, argv, "f:CHMD:b:o:#");
+               if (c == -1)
+                       break;
+
+               switch (c) {
+               case 'f':
+                       if (strlen(optarg) >= sizeof(format)) {
+                               VIDDBG(0, "Format name %s is too long\n",
+                                      optarg);
+                               return (EINVAL);
+                       }
+                       strcpy(format, optarg);
+
+                       /*
+                        * If we either couldn't determine format from
+                        * filename argument or we thought we could but
+                        * '-f' specifies different format, we append
+                        * appropriate suffix
+                        */
+                       if (!file_fmt || strcmp(format, file_fmt)) {
+                               // Room for '.', the suffix and the NUL?
+                               len = strlen(filename);
+                               if (len + 1 + strlen(format) >=
+                                   sizeof(filename)) {
+                                       VIDDBG(0, "File name is too long\n");
+                                       return (ENAMETOOLONG);
+                               }
+                               (void)strcat(filename, ".");
+                               (void)strcat(filename, format);
+                               file_fmt = format;
+                       }
+
+                       break;
+               case 'C':
+                       operations |= VDISK_OP_CREATE;
+                       break;
+               case 'H':
+                       /* File to read headers from */
+                       operations |= VDISK_OP_HEADERS;
+                       break;
+               case 'M':
+                       /* File whose headers are to be modified */
+                       operations |= VDISK_OP_MODIFY;
+                       break;
+               case 'D':
+                       first_block = atol(optarg);
+                       if (first_block < 0) {
+                               VIDDBG(0, "First block must be a "
+                                      "non-negative number\n");
+                               exit(1);
+                       }
+                       operations |= VDISK_OP_DUMP;
+                       break;
+               case 'b':
+                       num_blocks = atol(optarg);
+                       if (num_blocks < 0) {
+                               VIDDBG(0, "Number of blocks must be a "
+                                      "non-negative number\n");
+                               exit(1);
+                       }
+                       break;
+               case 'o':
+                       // Don't confuse vdisk with output file
+                       if (optarg == argv[argc-1]) {
+                               print_usage(argv[0]);
+                               exit(1);
+                       }
+                       free(outfile); // in case '-o' is given twice
+                       outfile = strndup(optarg, strlen(optarg));
+                       if (outfile == NULL) {
+                               VIDDBG(0, "Out of memory\n");
+                               exit(1);
+                       }
+                       // Must not fall into the '-#' handling below
+                       break;
+               case '#':
+
+                       if (file_fmt) {
+                               err = vdf_find_vdfd(&vdisk, file_fmt);
+                               if (err) {
+                                       VIDDBG(0, "Fail to initialize "
+                                               "format data for %s\n",
+                                               format);
+                                       return (err);
+                               }
+                       } else {
+                               VIDDBG(0, "Unspecified or unsupported "
+                                      "format\n");
+                               print_usage(argv[0]);
+                               return (EINVAL);
+                       }
+
+                       if (vdisk.vdfd->parse_args(argc, operations,
+                                                   argv, &optp) != 0) {
+                               print_usage(argv[0]);
+                               return (EINVAL);
+                       }
+
+                       break;
+               default:
+                       print_usage(argv[0]);
+                       return (EINVAL);
+               }
+       }
+
+       /*
+        * At least one operation type is needed and
+        * filename needs to be specified
+        */
+       if (!operations || !file_fmt) {
+               print_usage(argv[0]);
+               return (EINVAL);
+       }
+
+       // XXX: We probably should have initialized by now
+       if (vdisk.vdfd == NULL) {
+               err = vdf_find_vdfd(&vdisk, file_fmt);
+               if (err) {
+                       VIDDBG(0, "Fail to initialize format data for %s\n",
+                               format);
+                       return (err);
+               }
+       }
+
+
+       // First create file, if requested
+       if (operations & VDISK_OP_CREATE) {
+               err = vdisk.vdfd->create_vdisk(filename, optp);
+               if (err) {
+                       VIDDBG(0, "Can't create file\n");
+                       return (err);
+               }
+       }
+
+       props.alloc_func = NULL;
+       props.free_func = NULL;
+       props.out_target = VDISK_OUT_STDERR;
+
+       // Open read-only unless the vdisk is going to be changed
+       if (!(operations & VDISK_OP_CREATE) &&
+           !(operations & VDISK_OP_MODIFY))
+               flags = VDISK_RO;
+       else
+               flags = 0;
+
+       err = vdisk_init(&vdisk, filename, &props, flags);
+       if (err) {
+               VIDDBG(0, "Fail to initialize from file %s\n",
+                       filename);
+               return (err);
+       }
+
+       if (operations & VDISK_OP_HEADERS) {
+               err = vdf_print_headers(&vdisk, filename);
+               if (err) {
+                       VIDDBG(0, "Can't read headers\n");
+                       return (err);
+               }
+       }
+
+       if (operations & VDISK_OP_MODIFY) {
+               err = vdisk.vdfd->modify_vdisk(&vdisk, optp);
+               if (err) {
+                       VIDDBG(0, "Can't modify headers\n");
+                       return (err);
+               }
+       }
+
+       if (operations & VDISK_OP_DUMP) {
+               uint8_t *buf, *p;
+               int bytes;
+               int chunk_log = 21; // 2MB
+               int nblocks;
+               int fd;
+
+               // Open output file (use stdout if not specified)
+               if (outfile != NULL) {
+                       fd = open(outfile, O_RDWR|O_CREAT,
+                                 S_IRUSR|S_IWUSR);
+                       if (fd == -1) {
+                               VIDDBG(0, "Can't open %s: %s\n",
+                                      outfile, strerror(errno));
+                               exit(1);
+                       }
+               } else
+                       fd = 1; // stdout
+
+               // Allocate 512b-aligned read buffer
+               p = malloc((1<<chunk_log) + 512);
+               while (p == NULL) { // Try smaller chunks if we fail
+                       // A chunk smaller than one 512-byte sector would
+                       // make nblocks zero and the loop below endless
+                       if (chunk_log == 9) {
+                               VIDDBG(0, "Can't allocate buffer\n");
+                               exit(1);
+                       }
+                       chunk_log--;
+                       p = malloc((1<<chunk_log) + 512);
+               }
+               buf = p;
+               while ((addr_t)buf & 511) buf++;
+
+               // nblocks per transfer
+               nblocks = (1<<chunk_log) >> 9;
+
+               // If number of blocks to read is not specified,
+               // read whole vdisk
+               if (num_blocks < 0)
+                       num_blocks = vdisk.sz >> 9;
+
+               for (i=0; i<num_blocks; i+=nblocks) {
+                       uint8_t *out = buf;
+                       size_t left;
+
+                       // This could happen on last iteration
+                       if ((i+nblocks) > num_blocks)
+                               nblocks = num_blocks - i;
+
+                       bytes = vdisk_rw(&vdisk, first_block+i, buf, nblocks,
+                                        VDISK_READ, NULL);
+                       if (bytes != (nblocks << 9)) {
+                               VIDDBG(0, "vdisk_rw() returned %d\n", bytes);
+                               exit(1);
+                       }
+
+                       // write() may write less than asked for (pipes,
+                       // signals), so keep going until all is out
+                       left = (size_t)nblocks << 9;
+                       while (left > 0) {
+                               ssize_t done = write(fd, out, left);
+
+                               if (done == -1) {
+                                       if (errno == EINTR)
+                                               continue;
+                                       VIDDBG(0, "write: %s\n",
+                                              strerror(errno));
+                                       exit(1);
+                               }
+                               out += done;
+                               left -= (size_t)done;
+                       }
+               }
+
+               free(p);
+
+               // Only close what we opened ourselves, stdout stays open
+               if (outfile != NULL && close(fd) == -1) {
+                       VIDDBG(0, "close: %s\n", strerror(errno));
+                       exit(1);
+               }
+       }
+
+       free(outfile);
+       return 0;
+}
diff -r 75c61490cc06 tools/vdisk/vdisk_utils.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/vdisk_utils.c Thu Jun 21 13:05:31 2007 -0400
@@ -0,0 +1,435 @@
+// Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+//
+// Portions have been modified by Virtual Iron Software, Inc.
+// (c) 2007. This file and the modifications can be redistributed and/or
+// modified under the terms and conditions of the GNU General Public
+// License, version 2.1 and not any later version of the GPL, as published
+// by the Free Software Foundation.
+
+#define _GNU_SOURCE // for O_DIRECT
+#include <stdio.h>
+#include <stdlib.h> 
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stdarg.h>
+#include <libaio.h>
+#include <time.h>
+#include <limits.h>
+
+#include "list.h"
+#include "vdisk.h"
+#include "vdisk_utils.h"
+
+// Number of level-0 messages printed before further ones are suppressed
+#define VDISK_MAX_ERRORS       (100)
+// Size of the buffer a log message is formatted into
+#define VDISK_ERR_STRING_LEN   (512)
+
+// Allocator hooks used by vdisk_malloc()/vdisk_free(); they default to
+// malloc/free and can be replaced through vdisk_alloc_init()
+static void *(*vdisk_alloc_func)(size_t sz) = malloc;
+static void (*vdisk_free_func)(void *ptr) = free;
+
+// Messages with a level above this value are dropped by vdisk_log_error()
+int vdisk_dbg_level = 1;
+int vdisk_out_target = VDISK_OUT_STDERR; // where to print messages
+
+// Don't want to keep this on stack
+// NOTE(review): one buffer shared by every caller of vdisk_log_error() --
+// looks unsafe for concurrent callers, confirm
+static char argstring[VDISK_ERR_STRING_LEN];
+
+// Data structure to help with message throttling
+struct vdisk_log_mgt {
+       int vdisk_err_cnt;      // level-0 messages counted so far
+       int interval;           // throttling interval; 0 means not throttling
+       int restart;            // set when a message arrives before next_check
+       time_t last_error;      // time of the last level-0 message
+       time_t next_check;      // time at which throttling is re-evaluated
+};
+// Initial state: no throttling, next_check as far away as possible
+static struct vdisk_log_mgt vdisk_log = {
+       .vdisk_err_cnt = 0,
+       .restart       = 0,
+       .interval      = 0,
+       .last_error    = (time_t)0,
+       .next_check    = (time_t)LONG_MAX,
+};
+
+// Emit one already-formatted message on the configured target: syslog
+// when vdisk_out_target is VDISK_OUT_SYSLOG, stderr otherwise. When
+// 'file' is non-NULL the message is prefixed with "file:line: ".
+static void
+vdisk_print_msg(char *file, int line, char *msg)
+{
+       int to_syslog = (vdisk_out_target == VDISK_OUT_SYSLOG);
+
+       if (!file) {
+               // No source location to report
+               if (to_syslog)
+                       syslog(LOG_DEBUG, "%s", msg);
+               else
+                       fprintf(stderr, "%s", msg);
+               return;
+       }
+
+       if (to_syslog)
+               syslog(LOG_DEBUG, "%s:%d: %s", file, line, msg);
+       else
+               fprintf(stderr, "%s:%d: %s", file, line, msg);
+}
+
+// Format and print a log message, throttling level-0 (error) messages.
+//
+//  level      messages with level > vdisk_dbg_level are dropped
+//  file, line source location passed on to vdisk_print_msg()
+//  fmt, ...   printf-style format and arguments; the formatted text is
+//             cut off at VDISK_ERR_STRING_LEN bytes
+//
+// Levels other than 0 are always printed. For level 0, messages are
+// counted and no longer printed once VDISK_MAX_ERRORS is reached; the
+// count is cleared again when 'next_check' passes, and the interval
+// between such checks grows while messages keep arriving.
+void
+vdisk_log_error(int level, char *file, int line, char *fmt, ...)
+{
+       int print_msg;
+
+       if (level > vdisk_dbg_level)
+               return;
+                               
+       print_msg = 0;
+
+       // Decide whether to print the message.
+       // Only manage message reporting for level 0, which is
+       // usually reserved for errors. Other messages will be
+       // printed unconditionally.     
+       if (level == 0) {                       
+               time_t now;
+
+               if (time(&now) == (time_t)-1) {
+                       // This should never happen ;-()
+                       vdisk_print_msg(NULL, 0, "vdisk: Can't get time, "
+                                       "error reporting stopped\n");
+                       return; // XXX: Or continue?
+               }
+
+               // next_check starts out as LONG_MAX, so this branch is only
+               // taken after throttling has been started further down
+               if (now >= vdisk_log.next_check) {
+
+                       // Quiet for longer than the current interval:
+                       // go back to unthrottled reporting
+                       if (now - vdisk_log.last_error > 
+                           (time_t)vdisk_log.interval) {
+                               // reset message throttling
+                               vdisk_log.restart = 0;
+                               vdisk_log.interval = 0;
+                               vdisk_log.vdisk_err_cnt = 0;
+                               vdisk_log.next_check = LONG_MAX;
+                               vdisk_print_msg(NULL, 0, "vdisk: Restoring "
+                                               "error reporting\n");
+                       }
+
+                       // Messages kept coming during the last interval
+                       if (vdisk_log.restart) {                                
+                               // Double the interval, max at 128 seconds
+                               vdisk_log.interval = (vdisk_log.interval > 64) 
? 
+                                       vdisk_log.interval :
+                                       (vdisk_log.interval * 2);
+                               vdisk_log.next_check += 
+                                       (time_t)vdisk_log.interval;
+                               vdisk_log.restart = 0;
+                       } 
+                
+                       // Start counting afresh for the new interval
+                       vdisk_log.vdisk_err_cnt = 0;    
+
+               } else {
+                       // Message received during throttling interval.
+                       // We will need to double the interval later
+                       vdisk_log.restart = 1;
+               }
+
+               // Still below the limit: this message gets printed
+               if (vdisk_log.vdisk_err_cnt < VDISK_MAX_ERRORS) {
+                       vdisk_log.vdisk_err_cnt++;
+                       print_msg = 1;
+               }                       
+
+               // Limit just reached. The counter is bumped past the limit
+               // so that this block runs only once per counting period.
+               if (vdisk_log.vdisk_err_cnt == VDISK_MAX_ERRORS) {
+                       vdisk_log.vdisk_err_cnt++;
+                       if (vdisk_log.interval == 0) {
+                               // Start interval management
+                               vdisk_print_msg(NULL, 0, "vdisk: Too many "
+                                               "errors, slowing down rate "
+                                               "of reporting\n");
+                               vdisk_log.interval = 1;
+                               vdisk_log.next_check = now + 
+                                       (time_t)vdisk_log.interval;
+                       }
+               } 
+
+               vdisk_log.last_error = now;
+
+       } else
+               print_msg = 1;
+
+
+       if (print_msg) {
+               va_list args;
+               
+               // Roll arguments into a string
+               va_start(args, fmt);
+               (void)vsnprintf(argstring, VDISK_ERR_STRING_LEN,
+                               fmt, args);
+               va_end(args);
+
+               vdisk_print_msg(file, line, argstring);
+       }
+}
+
+// Replace the allocator hooks used by vdisk_malloc() and vdisk_free().
+// A NULL argument leaves the corresponding hook as it is.
+void
+vdisk_alloc_init(void *alloc_func, void *free_func)
+{
+       if (alloc_func) {
+               vdisk_alloc_func = alloc_func;
+       }
+       if (free_func) {
+               vdisk_free_func = free_func;
+       }
+}
+
+// Allocate 'sz' bytes through the installed allocator hook and zero
+// them. Returns NULL when the hook returns NULL.
+void *
+vdisk_malloc(size_t sz)
+{
+       void *mem = vdisk_alloc_func(sz);
+
+       if (mem == NULL)
+               return (NULL);
+
+       memset(mem, 0, sz);
+       return (mem);
+}
+
+// Release memory through the installed free hook.
+// 'ptr' is passed by value, so the caller's pointer is not changed by
+// this call; callers that keep the pointer around must clear it
+// themselves.
+void
+vdisk_free(void *ptr)
+{
+       vdisk_free_func(ptr);
+}
+
+// Flush file 'fp' to disk, ask the kernel to drop its cached pages and
+// close it. Failures of fsync() and posix_fadvise() are only logged.
+// Returns the result of close().
+int
+vdisk_close(int fp)
+{
+       int err;
+
+       err = fsync(fp);
+       if (err)
+               VIDDBG(0, "fsync(): %s\n", strerror(errno));
+
+       // Invalidate all pages from page cache.
+       // posix_fadvise() returns the error number itself and doesn't
+       // set errno.
+       err = posix_fadvise(fp, 0, 0, POSIX_FADV_DONTNEED);
+       if (err)
+               VIDDBG(0, "posix_fadvise(): %s\n", strerror(err));
+
+       err = close(fp);
+       return (err);
+}
+
+// Determine the size of file 'f' by seeking to its end.
+//
+//  f   open file descriptor
+//  sz  receives the file size in bytes
+//
+// The file position is the same after the call as it was before.
+// Returns 0 on success or the errno value of the lseek() that failed.
+size_t
+vdisk_size(int f, size_t *sz)
+{
+       off_t cur;
+       off_t end;
+       int err;
+
+       /*
+        * XXX: Obviously, we should use fstat(). Unfortunately, I couldn't
+        * figure out how to make a dynamic library that calls fstat.
+        * See glibc FAQ for description of *problem* (why couldn't they
+        * provide a solution as well?)
+        */
+
+       // Remember current position
+       cur = lseek(f, 0, SEEK_CUR);
+       if (cur == (off_t)-1) {
+               err = errno;
+               VIDDBG(0, "lseek: Can't seek to current: %s\n",
+                      strerror(err));
+               return (err);
+       }
+
+       end = lseek(f, 0, SEEK_END);
+       *sz = (size_t)end;
+       if (end == (off_t)-1) {
+               err = errno;
+               VIDDBG(0, "lseek: Can't seek to end: %s\n", strerror(err));
+               return (err);
+       }
+
+       // Restore current position (the one saved above, not offset 0)
+       if (lseek(f, cur, SEEK_SET) == (off_t)-1) {
+               err = errno;
+               VIDDBG(0, "lseek: Can't seek to current: %s\n",
+                      strerror(err));
+               return (err);
+       }
+
+       return (0);
+}
+
+// Queue one asynchronous read or write; nothing is submitted here.
+//
+// The iocb and pending_aio of the hash entry selected by 'block' are
+// filled in and the iocb is appended to vdisk->aio_submit[].
+//
+//  vdisk       device the request belongs to
+//  block       block number, selects the hash entry
+//  fp          file descriptor to read from or write to
+//  buf, size   data buffer and its length in bytes
+//  off         file offset of the transfer
+//  arg, aiocb  stored in the pending_aio record as they are
+//  op          VDISK_WRITE for a write, anything else is a read
+//
+// Always returns 'size'.
+size_t
+vdisk_asyncio(vdisk_dev_t *vdisk, uint64_t block, 
+             int fp, char *buf, 
+             size_t size, off_t off, 
+             void *arg, void *aiocb,
+             int op)
+{
+       int slot = VDISK_HASH_IDX(block);
+       struct pending_aio *pending;
+       struct iocb *cb;
+
+       ASSERT(vdisk->aio_cnt < VDISK_HASH_SZ);
+       ASSERT(vdisk->hash[slot].key == block);
+
+       cb = &vdisk->hash[slot].io;
+       pending = &vdisk->hash[slot].pio;
+
+       // Describe the request in the pending_aio record
+       pending->fd = fp;
+       pending->op = op;
+       pending->off = off;
+       pending->block = block;
+       pending->num_blocks = size>>9;
+       pending->arg = arg;
+       pending->aiocb = aiocb;
+
+       if (op != VDISK_WRITE)
+               io_prep_pread(cb, fp, buf, size, off);
+       else
+               io_prep_pwrite(cb, fp, buf, size, off);
+
+       // Set after io_prep_*(), same order as before
+       cb->data = pending;
+
+       VIDDBG(50, "Using hash entry %d (block %d)\n", 
+              VDISK_HASH_IDX(pending->block), pending->block);
+
+       vdisk->aio_submit[vdisk->aio_cnt++] = cb;
+
+       return (size);
+}
+
+// Flush file 'fp' to disk, then ask the kernel to drop the cached pages
+// of the range [start, start + len). Failures are only logged.
+static void
+vdisk_flush_range(int fp, off_t start, off_t len)
+{
+       int res;
+
+       res = fsync(fp);
+       if (res)
+               VIDDBG(0, "fsync: %s\n", strerror(errno));
+
+       // posix_fadvise() returns the error number itself and doesn't
+       // set errno
+       res = posix_fadvise(fp, start, len, POSIX_FADV_DONTNEED);
+       if (res)
+               VIDDBG(0, "posix_fadvise: %s\n", strerror(res));
+}
+
+// Page cache management for synchronous writes.
+//
+// Adjacent or overlapping writes are merged into one run kept in 'syncio'.
+// The run is flushed and dropped from the page cache once it grows
+// beyond WRITE_RUN, or when a write arrives that doesn't continue it.
+//
+//  fp          file that has just been written
+//  syncio      state of the current run (is_set, io_start, io_len)
+//  start, len  range covered by the write
+static void
+vdisk_manage_pcache(int fp, vdisk_syncio_t *syncio, off_t start, off_t len)
+{
+#define        WRITE_RUN       (1<<22) //4MB
+       DO_STATS(time_t now);
+
+       DO_STATS(++(syncio->total_writes));
+
+       if (syncio->is_set) {
+               if (start >= syncio->io_start &&
+                   start <= syncio->io_start + syncio->io_len) {
+                       // The write starts inside or right at the end of
+                       // the run: the run now ends where the write ends
+                       syncio->io_len -= (syncio->io_start + 
+                                          syncio->io_len) - start;
+                       syncio->io_len += len;
+                       DO_STATS(++(syncio->contig_writes));
+                       if (syncio->io_len > WRITE_RUN) {
+                               DO_STATS(++(syncio->flush_size_force));
+
+                               syncio->is_set = 0;
+                               vdisk_flush_range(fp, syncio->io_start,
+                                                 syncio->io_len);
+                       }
+                       len = 0; // NOTE:len is consumed into previous
+               } else {
+#if VDISK_SYNCIO_STATS
+                       if (syncio->io_len < (1<<20))
+                               ++(syncio->flush_size_sub1MB);
+                       else if (syncio->io_len < (1<<21))
+                               ++(syncio->flush_size_sub2MB);
+                       else if (syncio->io_len < (1<<22))
+                               ++(syncio->flush_size_sub4MB);
+                       else if (syncio->io_len < (1<<23))
+                               ++(syncio->flush_size_sub8MB);
+                       else
+                               ++(syncio->flush_size_ovr8MB);
+#endif /* VDISK_SYNCIO_STATS */
+                       // The write is elsewhere: finish the old run
+                       syncio->is_set = 0;
+                       vdisk_flush_range(fp, syncio->io_start,
+                                         syncio->io_len);
+               }
+       }
+       if (len > 0) {
+               if (len <= WRITE_RUN) {
+                       // Start a new run with this write
+                       syncio->is_set = 1;
+                       syncio->io_start = start;
+                       syncio->io_len = len;
+               } else {
+                       // Too big to be worth tracking, flush right away
+                       DO_STATS(++(syncio->flush_size_force));
+                       vdisk_flush_range(fp, start, len);
+               }
+       }
+#if VDISK_SYNCIO_STATS
+       now = time(NULL);
+       if (now >= syncio->last_dbg_print + 60) {
+               VIDDBG(0, ":WRITE_PERF: [%lu] tWrts %lu | conWrts %lu | s1M %lu"
+                      " | s2M %lu | s4M %lu | s8M %lu | o8M %lu | f %lu\n",
+                      (unsigned long)(now - syncio->last_dbg_print),
+                      syncio->total_writes, syncio->contig_writes,
+                      syncio->flush_size_sub1MB, syncio->flush_size_sub2MB,
+                      syncio->flush_size_sub4MB, syncio->flush_size_sub8MB,
+                      syncio->flush_size_ovr8MB, syncio->flush_size_force);
+               syncio->last_dbg_print = now;
+       }
+#endif /* VDISK_SYNCIO_STATS */
+}
+
+// Synchronously read or write 'size' bytes at offset 'off' of file 'fp',
+// then keep the transferred range from lingering in the page cache.
+//
+//  fp      file descriptor
+//  buf     data buffer; asserted to be 512-byte aligned
+//  size    number of bytes; asserted to be a multiple of 512
+//  off     file offset; asserted to be a multiple of 512
+//  op      VDISK_WRITE for a write, anything else is a read
+//  syncio  write-run state for vdisk_manage_pcache(); with NULL every
+//          write is flushed and dropped from the cache right away
+//
+// Returns what read()/write() returned, converted to size_t, or
+// (size_t)-1 if the seek failed. A short or failed transfer is logged.
+size_t
+vdisk_syncio(int fp, char *buf, size_t size, off_t off, int op, 
+            vdisk_syncio_t *syncio)
+{
+       size_t bytes;
+       off_t pos;
+       off_t io_start;
+       off_t io_len;
+       int rc;
+
+       ASSERT(!(size & 511));
+       ASSERT(!(off & 511));
+       ASSERT(!((addr_t)buf & 511));
+
+       pos = vdisk_seek(fp, off, SEEK_SET);
+       if (pos != off) {
+               VIDDBG(0, "lseek couldn't set offset to 0x%" PRIx64 ": %s\n",
+                      off, strerror(errno));
+               return (-1);
+       }
+
+       if (op == VDISK_WRITE) {
+               bytes = write(fp, buf, size);
+       } else
+               bytes = read(fp, buf, size);
+
+       if (bytes != size) {
+               VIDDBG(0, "%s %zd bytes instead of %zd: %s\n",
+                      (op == VDISK_WRITE)?"Wrote":"Read",
+                      bytes, size, strerror(errno));
+       } 
+
+       // Range handed to the cache management: starts at the page holding
+       // 'off' and is one page longer than the transfer
+       io_start = (off & (~((off_t)vdisk_pagesz-1)));
+       io_len = (size + vdisk_pagesz);
+
+       // posix_fadvise() returns the error number itself and doesn't set
+       // errno, so its result is what has to go to strerror()
+       if (op == VDISK_READ) {
+               rc = posix_fadvise(fp, io_start, io_len, POSIX_FADV_DONTNEED);
+               if (rc)
+                       VIDDBG(0, "posix_fadvise: %s\n", strerror(rc));
+       } else if (syncio) {
+               vdisk_manage_pcache(fp, syncio, io_start, io_len);
+       } else {
+               rc = fsync(fp);
+               if (rc)
+                       VIDDBG(0, "fsync: %s\n", strerror(errno));
+               rc = posix_fadvise(fp, io_start, io_len, POSIX_FADV_DONTNEED);
+               if (rc)
+                       VIDDBG(0, "posix_fadvise: %s\n", strerror(rc));
+       }
+
+       return (bytes); 
+}
diff -r 75c61490cc06 tools/vdisk/vdisk_utils.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/vdisk_utils.h Thu Jun 21 13:05:31 2007 -0400
@@ -0,0 +1,36 @@
+// Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+//
+// Portions have been modified by Virtual Iron Software, Inc.
+// (c) 2007. This file and the modifications can be redistributed and/or
+// modified under the terms and conditions of the GNU General Public
+// License, version 2.1 and not any later version of the GPL, as published
+// by the Free Software Foundation.
+
+#ifndef __VDISK_UTILS
+#define __VDISK_UTILS
+
+
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "vdisk.h"
+
+
+
+// Thin wrappers around open() and lseek64() so that the rest of the
+// library goes through vdisk_* names for file access.
+#define vdisk_open(cp, fl, mode) open((cp), (fl), (mode))
+#define vdisk_seek(fp, off, whence) lseek64((fp), (off), (whence))
+
+// Helper routines provided by the vdisk utility code.
+// vdisk_syncio() does a synchronous, 512-byte-aligned read or write at the
+// given offset and returns the number of bytes transferred.
+// NOTE(review): the prototypes below use loff_t for the offset while the
+// definition of vdisk_syncio() takes off_t; confirm both are the same
+// width on every supported build.
+// NOTE(review): callers treat a non-zero return from vdisk_size() as an
+// error and read the size back through *sz; confirm against the
+// implementation.
+extern void *vdisk_malloc(size_t sz);
+extern void vdisk_free(void *ptr);
+extern int vdisk_close(int fp);
+extern size_t vdisk_size(int f, size_t *sz);
+extern size_t vdisk_syncio(int fp, char *buf, size_t sz, loff_t off, 
+                          int op, vdisk_syncio_t *syncio);
+extern size_t vdisk_asyncio(vdisk_dev_t *, uint64_t, int, char *, size_t, 
+                           loff_t, void *, void *, int);
+
+
+#endif /* __VDISK_UTILS */
diff -r 75c61490cc06 tools/vdisk/vhd.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/vhd.c Thu Jun 21 13:05:31 2007 -0400
@@ -0,0 +1,925 @@
+// Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+//
+// Portions have been modified by Virtual Iron Software, Inc.
+// (c) 2007. This file and the modifications can be redistributed and/or
+// modified under the terms and conditions of the GNU General Public
+// License, version 2.1 and not any later version of the GPL, as published
+// by the Free Software Foundation.
+
+#define _GNU_SOURCE // for O_DIRECT
+#include <stdio.h>
+#include <stdlib.h> 
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "list.h"
+#include "vdisk.h"
+#include "vdisk_utils.h"
+#include "vhd.h"
+#include "vhd_footer.h"
+
+// Backing storage for a footer-sized buffer of zeroes. It is 512 bytes
+// larger than needed so that a 512-byte-aligned pointer can be carved out
+// of it (done in vhd_init()).
+char __vhd_zeroes[VHD_FTR_SZ+512];
+// 512-byte-aligned pointer into __vhd_zeroes; valid after vhd_init().
+char *vhd_zeroes;
+
+// Mask that clears the low 9 bits, i.e. rounds a value down to a
+// 512-byte boundary.
+#define BLOCK_MASK (~(((addr_t)1<<9)-1))
+
+
+/*
+ * Consistency check of a VHD file's metadata.
+ *
+ * Placeholder: nothing is verified yet and 0 (success) is always
+ * returned. XXX: something more robust is needed here.
+ */
+int
+vhd_verify_metadata(vd_file_t *vf)
+{
+       (void)vf;
+
+       return 0;
+}
+
+/*
+ * Read the 512-byte footer stored at the end of the file into vhd->ftr.
+ *
+ * The footer buffer is allocated on first use (vhd->ftr_mem) and vhd->ftr
+ * is advanced to the first 512-byte-aligned address inside it, as required
+ * by vdisk_syncio().
+ *
+ * Returns 0 on success, ENOMEM if the buffer cannot be allocated and -1 on
+ * seek/read failure. On every failure the footer buffer is released and
+ * both vhd->ftr_mem and vhd->ftr are reset to NULL, so the caller's own
+ * cleanup cannot free the buffer a second time.
+ */
+int
+vhd_read_footer(vd_file_t *vf)
+{
+       off_t ftr_off, res;
+       vhd_file_t *vhd = vf->vdf;
+       size_t bytes;
+
+       if (vhd->ftr_mem == NULL) {
+               vhd->ftr_mem = vdisk_malloc(VHD_FTR_SZ+512);
+               if (vhd->ftr_mem == NULL) {
+                       VIDDBG(0, "Couldn't allocate dynamic header\n");
+                       return (ENOMEM);
+               }
+               vhd->ftr = vhd->ftr_mem;
+               while ((addr_t)vhd->ftr & 511) vhd->ftr++; 
+       }
+
+       /* Find file size (seek to the end) */
+       res = vdisk_seek(vf->fd, 0, SEEK_END);
+       if (res == -1) {
+               VIDDBG(0, "lseek couldn't set offset to end of file\n");
+               goto fail;
+       }
+
+       /* The footer occupies the last 512 bytes of the file */
+       ftr_off = res - 512;
+
+       res = vdisk_seek(vf->fd, ftr_off, SEEK_SET);
+       if (res != ftr_off) {
+               VIDDBG(0, "lseek couldn't set offset to 0x%" PRIx64 "\n",
+                      (uint64_t)ftr_off);
+               goto fail;
+       }
+
+       if ((bytes = vdisk_syncio(vf->fd, vhd->ftr, 512, ftr_off, 
+                                 VDISK_READ, NULL)) != 512) {
+               VIDDBG(0, "vdisk read from offset 0x%" PRIx64 " failed "
+                      "(read %zd insted of 512) %d\n", 
+                      (uint64_t)ftr_off, bytes, errno);
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       /* Drop the buffer and forget it: callers also free ftr_mem */
+       vdisk_free(vhd->ftr_mem);
+       vhd->ftr_mem = NULL;
+       vhd->ftr = NULL;
+       return (-1);
+}
+
+/*
+ * Read the dynamic disk header and the Block Allocation Table (BAT) of a
+ * dynamic or differencing VHD.
+ *
+ * The header lives right after the footer copy at the start of the file
+ * (offset VHD_FTR_SZ) and the BAT is read from the offset right after it.
+ * Both are read into freshly allocated buffers (vhd->dhdr_mem and
+ * vhd->bat_mem); vhd->dhdr and vhd->bat point at the first
+ * 512-byte-aligned address inside them.
+ *
+ * Returns 0 on success, ENOMEM on allocation failure and -1 on I/O
+ * failure. Nothing is freed here on failure: the buffers stay referenced
+ * from *vhd and are expected to be released by the caller.
+ */
+int
+vhd_read_dynhdr(vd_file_t *vf)
+{
+       off_t res;
+       vhd_file_t *vhd = vf->vdf;
+       size_t bat_sz;
+       int err = 0;
+
+
+       vhd->dhdr_mem = vdisk_malloc(VHD_DHDR_SZ+512);
+       if (vhd->dhdr_mem == NULL) {
+               VIDDBG(0, "Couldn't allocate dynamic header\n");
+               err = ENOMEM;
+               goto fail;
+       }
+
+       vhd->dhdr = vhd->dhdr_mem;
+       while ((addr_t)vhd->dhdr & 511) vhd->dhdr++; 
+
+       // Skip  copy of the footer
+       res = vdisk_seek(vf->fd, VHD_FTR_SZ, SEEK_SET);
+       if (res != VHD_FTR_SZ) {
+               VIDDBG(0, "Couldn't skip copy of the footer\n");
+               err = -1;
+               goto fail;
+       }
+       
+       if (vdisk_syncio(vf->fd, vhd->dhdr, VHD_DHDR_SZ, VHD_FTR_SZ,
+                        VDISK_READ, NULL) != VHD_DHDR_SZ) {
+               VIDDBG(0, "Failed to read dynamic header\n");
+               err = -1;
+               goto fail;
+       }
+
+       // Read BAT (in 512B units)
+       // XXX: May need to only keep a part of BAT due to memory size concerns
+       // Each BAT entry is 4 bytes; round the table up to whole 512B units.
+       bat_sz = vhd_get_dhdr_tbl_entries(vhd->dhdr) << 2;
+       if (bat_sz & 511)
+               bat_sz += (512-(bat_sz & 511));
+
+       vhd->bat_mem = vdisk_malloc(bat_sz+512);
+       if (vhd->bat_mem == NULL) {
+               VIDDBG(0, "Couldn't allocate BAT\n");
+               err = ENOMEM;
+               goto fail;
+       }
+       vhd->bat = vhd->bat_mem;
+       while ((addr_t)vhd->bat & 511) vhd->bat++; 
+
+       if (vdisk_syncio(vf->fd, (char *)vhd->bat, bat_sz, 
+                        VHD_DHDR_SZ+VHD_FTR_SZ, VDISK_READ, NULL) != bat_sz) {
+               VIDDBG(0, "Failed to read BAT\n");
+               err = -1;
+               goto fail;
+       }
+
+       return (0);
+
+fail:
+       return (err);
+}
+
+/*
+ * Load the format-specific metadata of one VHD file and attach it to `vf`.
+ *
+ * Allocates the vhd_file_t (vf->vdf), reads the footer and, for dynamic
+ * and differencing disks, the dynamic header, the BAT and the sectormap
+ * buffers/cache. Also fills in vdisk->sz and vdisk->geom from the footer
+ * and sets vf->batch_sz and vf->flags.
+ *
+ * Returns 0 on success or a non-zero error (ENOMEM, EIO, or whatever the
+ * helper that failed returned). On failure every buffer allocated here is
+ * released exactly once and vf->vdf is reset to NULL.
+ */
+int
+vhd_read_metadata(vdisk_dev_t *vdisk, vd_file_t *vf)
+{
+       int err;
+       int type;
+       int i;
+       vhd_file_t *vhd = NULL;
+       int secs_per_block;
+       uint32_t geom;
+
+       vf->vdf = (vhd_file_t *)vdisk_malloc(sizeof(vhd_file_t));
+       if (vf->vdf == NULL) {
+               VIDDBG(0, "Couldn't allocate format-specific data\n");
+               err = ENOMEM;
+               goto fail;
+       }
+       
+       vhd = vf->vdf;
+       memset(vhd, 0, sizeof(vhd_file_t));
+
+       err = vhd_read_footer(vf);
+       if (err) {
+               VIDDBG(0, "Couldn't read footer\n");
+               // vhd_read_footer() releases the footer buffer itself when
+               // it fails; forget the pointer so that the common cleanup
+               // below does not free it a second time.
+               vhd->ftr_mem = NULL;
+               vhd->ftr = NULL;
+               goto fail;
+       }
+
+       vdisk->sz = vhd_get_ftr_orig_sz(vhd->ftr);
+
+       type = vhd_get_ftr_type(vhd->ftr);
+       if ( (type != VHD_TYPE_FIXED) &&
+            (type != VHD_TYPE_DYNAMIC) &&
+            (type != VHD_TYPE_DIFF)){
+               // Return error for VHD_TYPE_NONE as well.
+               VIDDBG(0, "Unsupported VHD file type (%d)\n", type);
+               err = EIO; // XXX: Something else?
+               goto fail;
+       }
+
+       if (type != VHD_TYPE_FIXED) {
+               size_t sz;
+
+               // We should have a dynamic header
+               err = vhd_read_dynhdr(vf);
+               if (err) {
+                       VIDDBG(0, "Couldn't read dynamic header\n");
+                       goto fail;
+               }
+
+               // No fls() in userland, so we do log2 ourselves
+               vhd->sec_per_block_log = 0;
+               secs_per_block = vhd_get_dhdr_blksz(vhd->dhdr) >> 9;
+               while (secs_per_block >>= 1)
+                       vhd->sec_per_block_log++;
+
+               if (type == VHD_TYPE_DYNAMIC) {
+                       // How many sectors are mapped sequentially
+                       vf->batch_sz = (1<<vhd->sec_per_block_log);
+               } else {
+                       // XXX: Need to think about this.
+                       vf->batch_sz = 1;       
+               }
+
+               // bytes for sectormap is ((sectors per block) / 8)
+               vhd->sectormap_sz = (vhd_get_dhdr_blksz(vhd->dhdr) >> 9) >> 3;
+
+               // Align on 512-byte boundary
+               if ((vhd->sectormap_sz == 0) || (vhd->sectormap_sz & 511)) 
+                       vhd->sectormap_sz += 512 - (vhd->sectormap_sz & 511);
+               
+               // First new block will be allocated where the footer
+               // currently is, which is at the end of the file
+               err = vdisk_size(vf->fd, &sz);
+               if (err) {
+                       VIDDBG(0, "Couldn't get file size\n");
+                       goto fail;
+               }
+               vhd->next_block_off = (sz-VHD_FTR_SZ) >> 9;
+
+               // Allocate sectormap buffer
+               vhd->sec_mem = vdisk_malloc(512*2);
+               if (vhd->sec_mem == NULL) {
+                       VIDDBG(0, "Can't allocate sectormap\n");
+                       err = ENOMEM;
+                       goto fail;
+               }
+               vhd->secmap_chunk = vhd->sec_mem;
+               while ((addr_t)vhd->secmap_chunk & 511) vhd->secmap_chunk++;
+
+               // Allocate sectormap cache
+               for (i=0;i<VHD_CACHE_SZ;i++) {
+                       vhd->cache[i].sec_mem = vdisk_malloc(512*2);
+                       if (vhd->cache[i].sec_mem == NULL) {
+                               VIDDBG(0, "Can't allocate sectormap\n");
+                               err = ENOMEM;
+                               goto fail;
+                       }
+                       vhd->cache[i].secmap_chunk = vhd->cache[i].sec_mem;
+                       while ((addr_t)vhd->cache[i].secmap_chunk & 511) 
+                               vhd->cache[i].secmap_chunk++;
+
+                       // Point to sector 0 (or any other sector),
+                       // but make the map empty
+                       vhd->cache[i].first_sector = 0; //VHD_INVALID_SECTOR;
+                       memset(vhd->cache[i].secmap_chunk, 0, 512);
+               }
+
+               // Chain the cache entries into a doubly-linked LRU list
+               if (VHD_CACHE_SZ > 0) {
+                       vhd->cache_head = &vhd->cache[0];
+                       vhd->cache[0].prev = NULL;
+                       for (i=1;i<VHD_CACHE_SZ;i++) {
+                               vhd->cache[i-1].next = &vhd->cache[i];
+                               vhd->cache[i].prev = &vhd->cache[i-1];
+                       }
+                       vhd->cache_tail = &vhd->cache[VHD_CACHE_SZ-1];
+                       vhd->cache[VHD_CACHE_SZ-1].next = NULL;
+               } //else
+               //vhd->cache_head == NULL;
+       } else
+               vf->batch_sz = (1<<30); // (signed) infinity
+
+       vf->flags = 0;
+
+       err = vhd_verify_metadata(vf);
+       if (err) {
+               VIDDBG(0, "File appears to be corrupted\n");
+
+               // XXX: It may be salvageable
+               // All buffers are released by the common cleanup below;
+               // freeing them here as well would free them twice.
+               err = EIO;
+               goto fail;
+       }
+
+       // We are assuming here that all files of the
+       // vdisk have the same geometry.
+       geom = vhd_get_ftr_geom(vhd->ftr);
+       vdisk->geom.cyls = (geom >> 16) & 0xffff;
+       vdisk->geom.heads = (geom >> 8) & 0xff;
+       vdisk->geom.secs = geom & 0xff;
+
+       return (0);
+
+fail:
+       if (vhd) {
+               if (vhd->ftr_mem)
+                       vdisk_free(vhd->ftr_mem);
+               if (vhd->dhdr_mem)
+                       vdisk_free(vhd->dhdr_mem);
+               if (vhd->bat_mem)
+                       vdisk_free(vhd->bat_mem);
+               if (vhd->sec_mem)
+                       vdisk_free(vhd->sec_mem);
+               // *vhd was zeroed after allocation, so entries that were
+               // never allocated are still NULL here.
+               for (i=0;i<VHD_CACHE_SZ;i++) {
+                       if (vhd->cache[i].sec_mem)
+                               vdisk_free(vhd->cache[i].sec_mem);
+               }
+               vdisk_free(vhd);
+               vf->vdf = NULL; // do not leave a dangling pointer behind
+       }
+       return (err);
+}
+
+/*
+ * Allocate backing space for block `blockno` of a dynamic/differencing
+ * VHD by growing the file.
+ *
+ * The new block (sectormap followed by data) takes the place of the
+ * current footer at the end of the file:
+ *   1. the footer is written past the end of the new block,
+ *   2. the old footer location is overwritten with zeroes,
+ *   3. the 512-byte chunk of the BAT holding the entry is written back.
+ *
+ * The BAT entry for `blockno` must be unallocated on entry (asserted).
+ *
+ * Returns 0 on success or EIO if any of the writes fails. On failure the
+ * in-memory BAT entry is left unallocated and next_block_off is not
+ * advanced, so the in-memory state never maps two blocks to one offset.
+ */
+int
+vhd_alloc_block(vd_file_t *vf, uint32_t blockno)
+{
+       size_t bytes;
+       off_t bat_off;
+       off_t block_off;
+       char *ptr;
+       vhd_file_t *vhd = vf->vdf;
+       size_t blocksz;
+       uint32_t old_entry;
+
+
+       ASSERT(__arch__swab32(vhd->bat[blockno]) == VHD_BAT_INVALID_ENTRY);
+       ASSERT((vhd_get_dhdr_blksz(vhd->dhdr) & 511) == 0);
+       ASSERT((vhd->sectormap_sz & 511) == 0);
+
+       blocksz = vhd_get_dhdr_blksz(vhd->dhdr) + vhd->sectormap_sz;
+
+       // Byte offset of the new block. Widen before shifting so that
+       // the sector number is not truncated for files larger than 4GB.
+       block_off = (off_t)vhd->next_block_off << 9;
+
+       /*
+        * First try to write footer at new position.
+        * The hole should be filled with zeroes
+        * XXX: Are we sure?
+        */
+       bytes = vdisk_syncio(vf->fd, vhd->ftr, VHD_FTR_SZ, 
+                            block_off + blocksz,
+                            VDISK_WRITE, NULL);
+       if (bytes != VHD_FTR_SZ) {
+               VIDDBG(0, "Can't append footer\n");
+               return (EIO);
+       }
+       
+
+       // Overwrite footer with zeroes
+       bytes = vdisk_syncio(vf->fd, vhd_zeroes, VHD_FTR_SZ, 
+                            block_off, VDISK_WRITE, NULL);
+       if (bytes != VHD_FTR_SZ) {
+               VIDDBG(0, "Can't overwrite footer\n");
+               return (EIO);
+       }
+
+       // Now update BAT in a 512-b chunk
+       old_entry = vhd->bat[blockno];
+       vhd->bat[blockno] = __arch__swab32(vhd->next_block_off);
+       bat_off = (VHD_FTR_SZ + VHD_DHDR_SZ + (blockno<<2)) & BLOCK_MASK;
+       // vhd->bat is 512-byte aligned, so masking the entry's address
+       // yields the start of the in-memory chunk that contains it.
+       ptr = (char *)(((addr_t)&vhd->bat[blockno]) & BLOCK_MASK);
+       bytes = vdisk_syncio(vf->fd, ptr, 512, bat_off, VDISK_WRITE, NULL);
+       if (bytes != 512) {
+               VIDDBG(0, "Can't update BAT\n");
+               // Undo the in-memory update: next_block_off has not moved,
+               // so keeping the entry would let the next allocation hand
+               // out the same file offset to a different block.
+               vhd->bat[blockno] = old_entry;
+               return (EIO);
+       }
+
+       vhd->next_block_off += (blocksz >> 9);
+
+       return(0);
+}
+
+/*
+ * It would be easier to use test_bit()/set_bit() routines,
+ * but x86 bit test/set instructions count bits (in the last byte)
+ * from LSb, which is not what we want. We could recompute pos
+ * (pos=(pos&(~7))+7-(pos&7)) but doing this operation more
+ * explicitely seems to be safer.
+ */
+/*
+ * Test bit `pos` of the bitmap at `buf`. Bits are numbered from the most
+ * significant bit of each byte (bit 0 is 0x80 of buf[0]).
+ * Returns non-zero (the masked bit) if set, 0 otherwise.
+ */
+inline int
+vhd_test_bit(int pos, char *buf)
+{
+       uint8_t mask = 1 << (7 - (pos & 7));
+       uint8_t val = (uint8_t)buf[pos >> 3];
+
+       return (val & mask);
+}
+
+/*
+ * Return 1 if all `bits` consecutive bits starting at bit `start` of the
+ * bitmap at `buf` are set (also when `bits` is not positive), else 0.
+ */
+inline int
+vhd_test_bitset(int start, int bits, char *buf)
+{
+       int pos = start;
+
+       while (bits-- > 0) {
+               if (!vhd_test_bit(pos, buf))
+                       return (0);
+               pos++;
+       }
+
+       return (1);
+}
+
+/*
+ * Set bit `pos` of the bitmap at `buf`. Bits are numbered from the most
+ * significant bit of each byte (bit 0 is 0x80 of buf[0]).
+ */
+inline void
+vhd_set_bit(int pos, char *buf)
+{
+       char *target = &buf[pos >> 3];
+       uint8_t mask = 1 << (7 - (pos & 7));
+
+       *target = (uint8_t)*target | mask;
+}
+
+/*
+ * Set `bits` consecutive bits of the bitmap at `buf`, starting at bit
+ * `start`. Does nothing when `bits` is not positive.
+ */
+inline void
+vhd_set_bitset(int start, int bits, char *buf)
+{
+       int pos = start;
+
+       while (bits-- > 0) {
+               vhd_set_bit(pos, buf);
+               pos++;
+       }
+}
+
+
+/*
+ * Finish a write that touched sectors not yet marked in the sectormap.
+ *
+ *   arg  the vhd_xfer_t handed out by vhd_map_block() through *arg, or
+ *        NULL when there is nothing to commit
+ *   err  result of the data transfer; the sectormap is only updated when
+ *        it is 0
+ *
+ * On success of the data transfer the 512-byte sectormap chunk is re-read
+ * from the file, the bits of the written sectors are set and the chunk is
+ * written back. If a cache entry was reserved for this transfer it is
+ * filled with the updated chunk.
+ *
+ * The transfer descriptor is always released. Returns 0, or EIO if the
+ * sectormap chunk could not be read or written.
+ */
+int
+vhd_xfer_commit(void *arg, int err)
+{
+       vhd_xfer_t *vhdx = arg;
+       size_t bytes;
+       int ret = 0;
+
+       if (arg == NULL)
+               return (0);
+
+       if (err != 0)
+               goto out;
+
+       // Read the 512b chunk of sector map 
+       bytes = vdisk_syncio(vhdx->fd, vhdx->secmap_chunk, 512, 
+                            vhdx->secmap_addr, VDISK_READ, NULL);
+       if (bytes != 512) {
+               VIDDBG(0, "Failed to read sector bitmap\n");
+               ret = EIO;
+               goto out;
+       }
+
+       // Set sector bit
+       vhd_set_bitset(vhdx->sector_bit, vhdx->num_secs, 
+                      vhdx->secmap_chunk);
+
+       // and write it back
+       bytes = vdisk_syncio(vhdx->fd, vhdx->secmap_chunk, 512, 
+                            vhdx->secmap_addr, VDISK_WRITE, NULL);
+       if (bytes != 512) {
+               VIDDBG(0, "Can't commit access\n");
+               ret = EIO;
+               goto out;
+       }
+
+       // Publish the updated chunk in the cache entry that was
+       // invalidated for this transfer, making it valid again.
+       if (vhdx->cache && vhdx->first_sector != VHD_INVALID_SECTOR) {
+               ASSERT(vhdx->cache->first_sector == VHD_INVALID_SECTOR);
+               memcpy(vhdx->cache->secmap_chunk, vhdx->secmap_chunk,
+                      512);
+               vhdx->cache->first_sector = vhdx->first_sector;
+       }
+
+out:
+       // vhdx lives inside the allocation that vhdx->mem points to
+       vdisk_free(vhdx->mem);
+       return (ret);
+}
+
+// Microsoft uses "sector" for 512-byte unit that we 
+// refer to as "block" elsewhere.
+// This routine is *NOT* SMP-safe!
+//
+// Translate a run of virtual sectors into sectors of the backing file.
+//
+//   vf        file to map in
+//   sectorno  IN: first virtual sector; OUT: matching sector in the file
+//             (only rewritten when the run lives in a dynamic or
+//             differencing file and is found or allocated)
+//   num_secs  length of the run; it must not cross a block boundary
+//             (asserted)
+//   op        VDISK_READ, or a write otherwise
+//   arg       OUT: for a write that touches sectors not yet marked in the
+//             sectormap, receives a vhd_xfer_t to be passed to
+//             vhd_xfer_commit(); it is not written on any other path
+//
+// Returns VID_BLOCK_MAPPED when *sectorno is usable, VID_BLOCK_NOTMAPPED
+// when a read hits sectors that are not present, VID_BLOCK_TOOBIG when a
+// read covers both present and absent sectors, or an errno-style value
+// (EIO, or the result of vhd_alloc_block()) on failure.
+int
+vhd_map_block(vd_file_t *vf, 
+             uint32_t *sectorno,      /* IN/OUT */
+             int num_secs,
+             int op,
+             void **arg)       
+{
+       vhd_file_t *vhd = vf->vdf;
+       int type = vhd_get_ftr_type(vhd->ftr);
+       uint32_t blockno; // block of sectors in the file
+       int err;
+       size_t bytes;
+       int sector_bit; // bit offset into 512b chunk of sectormap
+       int sector_in_block;
+       off_t sectormap_addr;
+       uint32_t first_sector;
+       vhd_cache_t *cache = vhd->cache_head;
+
+
+       // Fixed disks are mapped 1:1; *sectorno is left untouched
+       if (type == VHD_TYPE_FIXED)
+               return (VID_BLOCK_MAPPED);
+
+       vhd->stats.access++;
+       
+       blockno = *sectorno >> vhd->sec_per_block_log;
+       
+       // We can only map sequence on sectors in the same block
+       ASSERT(((*sectorno+num_secs-1) >> vhd->sec_per_block_log)
+              == blockno);
+
+       // First sector in the block (really, blockno<<vhd->sec_per_block_log)
+       first_sector = *sectorno & (~(((uint32_t)1<<vhd->sec_per_block_log)-1));
+
+       // This sector's offset in the block
+       sector_in_block = *sectorno & (((uint32_t)1<<vhd->sec_per_block_log)-1);
+
+       // NOTE(review): cache entries are keyed by the block's first sector
+       // only, while sector_bit is relative to one 512-byte chunk of the
+       // sectormap; this looks correct only while a block's sectormap fits
+       // in a single chunk (at most 4096 sectors per block) - confirm.
+       sector_bit = sector_in_block & ((512*8)-1); // 8 bits in a byte
+       // Look for a cached sectormap chunk of this block. On exit from
+       // the loop `cache` is either the matching entry (whose bits did
+       // not cover the whole run) or NULL when nothing matched.
+       while (cache != NULL) {
+               if (cache->first_sector == first_sector) {
+                       // Sectormap is cached
+                       if (vhd_test_bitset(sector_bit, num_secs, 
+                                           cache->secmap_chunk)) {
+                               
+                               // sector is mapped
+                               *sectorno = cache->phys_first_sector +
+                                       + sector_in_block;
+                               
+                               vhd->stats.cache_hit++;
+                               
+                               // Make the line LRU
+                               // (unlink the entry and move it to the
+                               // head of the list, unless it is the head)
+                               if (cache->prev) {
+                                       cache->prev->next = cache->next;
+                                       if (cache->next)
+                                               cache->next->prev =
+                                                       cache->prev;
+                                       else
+                                               vhd->cache_tail = cache->prev;
+                                       
+                                       cache->next = vhd->cache_head;
+                                       cache->next->prev = cache;
+                                       cache->prev = NULL;
+                                       vhd->cache_head = cache;
+                               }
+                               
+                               return (VID_BLOCK_MAPPED);
+                       } else {
+                               break;
+                       }
+               }
+               cache = cache->next;
+       }
+       
+       if (__arch__swab32(vhd->bat[blockno]) == VHD_BAT_INVALID_ENTRY) {
+               
+               // For reads, the caller will assume that
+               // read returned zeroes
+               if (op == VDISK_READ)
+                       return (VID_BLOCK_NOTMAPPED);
+               
+               err = vhd_alloc_block(vf, blockno);
+               vhd->stats.block_alloc++;
+               VIDDBG(100, "Allocated block %d\n", blockno);
+               if (err) {
+                       VIDDBG(0, "Failed to allocate block\n");
+                       return (err);
+               }
+       }
+       
+       if (VHD_CACHE_SZ > 0) {
+               
+               // No entry matched this block: recycle the tail entry by
+               // moving it to the head of the list, provided it is not
+               // marked invalid (i.e. waiting for a commit).
+               if (vhd->cache_tail->first_sector != VHD_INVALID_SECTOR) {
+                       if ((cache == NULL) && (vhd->cache_head != NULL)) {
+                               vhd_cache_t *oldh = vhd->cache_head;
+                               vhd_cache_t *oldt = vhd->cache_tail;
+                               
+                               vhd->cache_head = oldt;
+                               vhd->cache_tail = oldt->prev;
+                               
+                               vhd->cache_head->next = oldh;
+                               oldh->prev = oldt;
+                               
+                               vhd->cache_head->prev = NULL;
+                               
+                               vhd->cache_tail->next = NULL;
+                               
+                               cache = vhd->cache_head;
+                       }
+                       
+               } else {
+                       // tail cache fill is in-flight. We assume that
+                       // all others are in-flight as well.
+                       // We will not be caching
+                       // XXX: we should probably walk the list
+                       //first_sector = VHD_INVALID_SECTOR;
+               }
+       }
+       
+       // Read a block worth of sector bitmap
+       // (the 512-byte chunk that holds this sector's bit; the BAT entry
+       // is a sector number, hence the shift by 9 to get a byte offset)
+       sectormap_addr = 
+               ((uint64_t)__arch__swab32(vhd->bat[blockno])<<9) +
+               ((sector_in_block>>3) & BLOCK_MASK);
+       bytes = vdisk_syncio(vf->fd, vhd->secmap_chunk, 512, 
+                            sectormap_addr, VDISK_READ, NULL);
+       if (bytes != 512) {
+               VIDDBG(0, "Failed to read sector bitmap\n");
+               return (EIO);
+       }
+       
+       // See whether the sector is present
+       if (!vhd_test_bitset(sector_bit, num_secs, vhd->secmap_chunk)) {
+               vhd_xfer_t *vhdx;
+               int byteaddr, bitno;
+               char *ptr;
+               
+               // For reads, the caller will assume that
+               // read returned zeroes
+               if (op == VDISK_READ) {
+                       int i;
+                       int mapped = 0;
+                       
+                       for (i=0; i<num_secs; i++) {
+                               if (vhd_test_bit(sector_bit+i, 
+                                                vhd->secmap_chunk)) {
+                                       mapped = 1;
+                                       break;
+                               }
+                       }
+
+                       if (!mapped) {
+                               // None of blocks is mapped
+                               return (VID_BLOCK_NOTMAPPED);
+                       } else {
+                               // Some blocks are mapped and some are not
+                               return (VID_BLOCK_TOOBIG);
+                       }
+               }
+
+               // NOTE(review): bitno is computed but never used, and
+               // byteaddr is only used by the ASSERT below.
+               byteaddr = sector_bit >> 3; // Find word in the map
+               bitno = sector_bit & 7;     // Bit in the word
+               ASSERT(byteaddr<512);
+               
+               // sectormap is the first member and will be aligned
+               // (the descriptor is placed at the first 512-byte-aligned
+               // address inside an allocation that is 512 bytes larger)
+               vhdx = vdisk_malloc(sizeof(vhd_xfer_t)+512);
+               if (vhdx == NULL) {
+                       VIDDBG(0, "Failed to allocate commit data\n");
+                       return (EIO);
+               }
+               
+               ptr = (char *)vhdx;
+               while ((addr_t)ptr & 511) ptr++;
+               
+               if (((addr_t)ptr - (addr_t)vhdx) >= 512)
+                       VIDDBG(0, "vhdx=%p, ptr=%p\n", vhdx, ptr);
+               
+               ASSERT(((addr_t)ptr - (addr_t)vhdx) < 512);
+               
+               // Remember the start of the allocation so that
+               // vhd_xfer_commit() can free it
+               ((vhd_xfer_t *)ptr)->mem = (void *)vhdx;
+               vhdx = (vhd_xfer_t *)ptr;
+               vhdx->fd = vf->fd;
+               vhdx->secmap_addr = sectormap_addr;
+               vhdx->sector_bit = sector_bit;
+               vhdx->num_secs = num_secs;
+               
+               if (VHD_CACHE_SZ > 0) {
+                       //vhdx->cache = &vhd->cache[cache_index];
+                       vhdx->cache = cache;
+                       vhdx->first_sector = first_sector;
+                       if (cache) // Flush old cache entry 
+                               cache->first_sector = VHD_INVALID_SECTOR;
+               } else
+                       vhdx->first_sector = VHD_INVALID_SECTOR;
+               
+               *arg = vhdx;
+               
+               vhd->stats.sec_alloc++;
+               
+       } else {
+               // cache the map
+               if (VHD_CACHE_SZ > 0) {
+                       if (cache && 
+                           (cache->first_sector != VHD_INVALID_SECTOR)) {
+                               memcpy(cache->secmap_chunk, 
+                                      vhd->secmap_chunk, 512);
+                               cache->first_sector = first_sector;
+                       }
+               }
+       }       
+
+       // Data sectors of a block follow its sectormap in the file
+       if (cache)
+               cache->phys_first_sector = __arch__swab32(vhd->bat[blockno]) +
+                       (vhd->sectormap_sz >> 9);
+       
+       // Sector in the backing file
+       *sectorno = (__arch__swab32(vhd->bat[blockno])) + sector_in_block 
+               + (vhd->sectormap_sz >> 9);
+       
+
+       return (VID_BLOCK_MAPPED);
+}
+
+/*
+ * Close every file attached to `vdisk` and release all VHD metadata.
+ *
+ * For each file on vdisk->vdf_head: log the access statistics, free the
+ * footer, dynamic header, BAT, sectormap buffer and the per-entry
+ * sectormap cache buffers, unlink the file from the list, close its
+ * descriptor and free the vd_file_t itself. The vdisk structure is not
+ * freed. Calling this on a vdisk with an empty file list is a no-op.
+ */
+void
+vhd_close(struct vdisk_dev *vdisk)
+{
+       struct list_head *ptr, *tmp;
+       vd_file_t *vf;
+       vhd_file_t *vhd;
+       int err;
+       int i;
+
+       if (vdisk == NULL) {
+               VIDDBG(0, "Invalid vdisk pointer\n");
+               return;
+       }
+
+       list_for_each_safe(ptr, tmp, &vdisk->vdf_head) {
+
+               vf = list_entry(ptr, vd_file_t, vdf_list);
+               if (vf == NULL) {
+                       VIDDBG(0, "Invalid vdisk file pointer\n");
+                       return;
+               }
+
+               vhd = vf->vdf;
+               if (vhd) {
+                       VIDDBG(10, "VHD Stats for %s: \n"
+                              "\t accesses:\t%" PRId64 "\n"
+                              "\t cache_hit:\t%" PRId64 "\n"
+                              "\t block_alloc:\t%" PRId64 "\n"
+                              "\t sec_alloc:\t%" PRId64 "\n"
+                              "\t total IOs:\t%" PRId64 "\n"
+                              "\t busy:\t%" PRId64 "\n"
+                              "\t sync:\t%" PRId64 "\n"
+                              "\t async:\t%" PRId64 "\n",
+                              vf->name,
+                              vhd->stats.access, 
+                              vhd->stats.cache_hit,
+                              vhd->stats.block_alloc, 
+                              vhd->stats.sec_alloc,
+                              vdisk->tot_io, 
+                              vdisk->busyio, 
+                              vdisk->syncio, 
+                              vdisk->asyncio);
+
+                       if (vhd->ftr_mem)
+                               vdisk_free(vhd->ftr_mem);
+                       if (vhd->dhdr_mem)
+                               vdisk_free(vhd->dhdr_mem);
+                       if (vhd->bat_mem)
+                               vdisk_free(vhd->bat_mem);
+                       if (vhd->sec_mem)
+                               vdisk_free(vhd->sec_mem);
+                       // Sectormap cache buffers are allocated per entry
+                       // for dynamic/differencing disks; for fixed disks
+                       // the pointers are still NULL.
+                       for (i=0;i<VHD_CACHE_SZ;i++) {
+                               if (vhd->cache[i].sec_mem)
+                                       vdisk_free(vhd->cache[i].sec_mem);
+                       }
+                       vdisk_free(vhd);
+                       vf->vdf = NULL;
+               }
+
+               list_del(&vf->vdf_list);
+               
+               err = vdisk_close(vf->fd);
+               if (err)
+                       VIDDBG(0, "close(%s): %s\n", vf->name, strerror(errno));
+
+               vdisk_free(vf);
+
+               if (list_empty(&vdisk->vdf_head))
+                       break;
+       }
+}
+
+
+/*
+ * Open a VHD image and, for differencing disks, its chain of parents.
+ *
+ *   vdisk     device the files are attached to (vdisk->vdf_head); the
+ *             VDISK_RO flag selects read-only access for the leaf
+ *   filename  path of the leaf image
+ *
+ * The leaf is opened read-write unless VDISK_RO is set; every parent is
+ * opened read-only. Each file's metadata is read and the file is appended
+ * to vdisk->vdf_head, leaf first. For a differencing disk the parent
+ * locator entries of the dynamic header are tried in order and the first
+ * one that can be opened names the next file in the chain.
+ *
+ * Returns 0 on success or an errno-style value. When a file cannot be
+ * allocated, opened or parsed, all files attached so far are closed with
+ * vhd_close(). The vdisk structure itself is never freed here.
+ * NOTE(review): failures while resolving a parent leave the files attached
+ * so far on the list (as before); the caller is expected to vhd_close().
+ */
+int vhd_open(struct vdisk_dev *vdisk, char *filename)
+{
+       int ret = 0;
+       int err;
+       vd_file_t *vf, *child_vf = NULL;
+       char *f, *child = NULL;
+       vhd_file_t *vhd;
+       int rw;
+
+       if (vdisk->flags & VDISK_RO)
+               rw = O_RDONLY;
+       else
+               rw = O_RDWR;
+
+       f = (char *)filename;
+       
+       while (f != NULL) { // Read all file associated with this VD file
+               
+               vf = (vd_file_t *)vdisk_malloc(sizeof(vd_file_t));
+               if (vf == NULL) {
+                       VIDDBG(0, "Couldn't allocate vd_file structure\n");
+                       // Release what was attached so far, like the other
+                       // error paths; vdisk belongs to the caller.
+                       vhd_close((void *)vdisk);
+                       return (ENOMEM);
+               }
+               memset(vf, 0, sizeof(vd_file_t));
+
+               // Bounded copy; truncates over-long names and always
+               // NUL-terminates within the name buffer.
+               snprintf(vf->name, sizeof(vf->name), "%s", f);
+
+               vf->fd = vdisk_open(f, rw, 0);
+               if (vf->fd < 0) {
+                       err = errno; // save before it can be clobbered
+                       VIDDBG(0, "Failed to open %s\n", f);
+                       vdisk_free(vf);
+                       vhd_close((void *)vdisk);
+                       return (err);
+               }       
+               err = vhd_read_metadata(vdisk, vf);
+               if (err) {
+                       VIDDBG(0, "Couldn't read metadata for %s\n", f);
+                       // vf is not on the list yet, so vhd_close() will
+                       // not close this descriptor for us.
+                       (void)vdisk_close(vf->fd);
+                       vdisk_free(vf);
+                       vhd_close((void *)vdisk);                               
+                       return (err);
+               }
+               
+               
+               if (child_vf == NULL) {
+                       vf->flags |= VDF_LEAF;
+                       rw = O_RDONLY; // for next iteration
+               }
+
+#if 0
+               // If this is a parent, verify paternity
+               if (!vhd_isfamily(vf, child_vf)) {
+                       VIDDBG(0, "%s is not parent of %s\n",
+                              f, child_vf);
+               }
+#endif         
+
+               list_add_tail(&vf->vdf_list, &vdisk->vdf_head);
+               
+               vhd = (vhd_file_t *)(vf->vdf);
+               if (vhd_get_ftr_type(vhd->ftr) == VHD_TYPE_DIFF ) {
+                       int i;
+                       int have_parent = 0;
+                       
+                       child = f;
+                       child_vf = vf;
+                       
+                       for (i=0;i<8;i++) {
+                               ple_t ple;
+                               int fd;
+                               char *parent;
+                               
+                               vhd_get_dhdr_ple(vhd->dhdr, &ple, i);
+                               if (ple.code != VHD_DYN_PLE_ABS &&
+                                   ple.code != VHD_DYN_PLE_REL)
+                                       continue;
+
+                               parent = vhd_get_parent_name(vf, &ple);
+                               if (parent == NULL) {
+                                       VIDDBG(0, "Can't locate parent "
+                                              "info for %s\n", child);
+                                       ret = EINVAL;
+                                       goto out;
+                               }
+                               f = parent;
+                               have_parent = 1;
+                               
+                               // stat would be better
+                               fd = open(f, O_RDONLY);
+                               if (fd >= 0) {
+                                       (void)close(fd);
+                                       break;
+                               }
+
+                               // Locator does not resolve: try the next
+                               // one. If none resolves, the last candidate
+                               // is kept and the open at the top of the
+                               // loop reports the failure.
+                               if (errno == ENOENT ||
+                                   errno == ELOOP ||
+                                   errno == ENOTDIR ||
+                                   errno == ENODEV ||
+                                   errno == EFAULT)
+                                       continue;
+
+                               ret = errno;
+                               VIDDBG(0, "stat(%s): %s\n",
+                                      f, strerror(ret));
+                               goto out;
+                       }
+
+                       if (!have_parent) {
+                               // Without this check f would still name the
+                               // child and the same file would be opened
+                               // again forever.
+                               VIDDBG(0, "No usable parent locator "
+                                      "in %s\n", child);
+                               ret = EINVAL;
+                               goto out;
+                       }
+               } else
+                       break;
+       }
+out:
+       return ret;     
+}
+
+/*
+ * Return the size recorded in the vdisk device behind the opaque
+ * handle `hdl` (its `sz` field).
+ */
+uint64_t
+vhd_size(void *hdl)
+{
+       return (((struct vdisk_dev *)hdl)->sz);
+}
+
+/*
+ * Report the disk geometry stored in the footer of the base image.
+ *
+ *   hdl    opaque handle; really a struct vdisk_dev *
+ *   cyls   OUT: cylinders (bits 31..16 of the footer geometry word)
+ *   heads  OUT: heads (bits 15..8)
+ *   secs   OUT: sectors per track (bits 7..0)
+ *
+ * Returns 0 on success, -1 if the device has no files attached or the
+ * base file carries no VHD metadata.
+ */
+int
+vhd_get_geometry(void *hdl, int *cyls, int *heads, int *secs)
+{
+       struct vdisk_dev *vdisk = (struct vdisk_dev *)hdl;
+       vd_file_t *vf = NULL;
+       struct list_head *ptr;
+       vhd_file_t *vhd;
+       uint32_t geom;
+
+
+       // Assume that the last file (base) has all the info
+       // (the walk leaves vf pointing at the last list entry)
+       list_for_each(ptr, &vdisk->vdf_head)
+               vf = list_entry(ptr, vd_file_t, vdf_list);
+
+       if (!vf) {
+               VIDDBG(0, "Can't find base file\n");
+               return (-1);
+       }
+
+       vhd = (vhd_file_t *)vf->vdf;
+       if (vhd == NULL) {
+               VIDDBG(0, "Can't find VHD data\n");
+               return (-1);
+       }
+       geom = vhd_get_ftr_geom(vhd->ftr);
+
+       *cyls = (geom >> 16) & 0xffff;
+       *heads = (geom >> 8) & 0xff;
+       *secs = geom & 0xff;
+
+       VIDDBG(10, "geom = 0x%x (0x%x 0x%x 0x%x)\n",
+              geom, *cyls, *heads, *secs);
+       
+       return (0);
+}
+
+// Format descriptor for VHD images; registered with the vdisk core by
+// vhd_init() and removed again by vhd_exit().
+// The initializer is positional: the file extension followed by the
+// format's entry points.
+// NOTE(review): the trailing {NULL,NULL} presumably initializes a list
+// link used by vdisk_register() - confirm against vdf_data_t.
+vdf_data_t vdfd_vhd = {
+       VHD_EXTENSION,
+       vhd_open,
+       vhd_close,
+       vhd_map_block,
+       vhd_xfer_commit,
+       vhd_print_header,
+       vhd_parse_args,
+       vhd_create_vdisk,
+       vhd_modify_vdisk,
+       {NULL,NULL},
+};
+
+/*
+ * Module initialization: point vhd_zeroes at the first 512-byte-aligned
+ * address inside __vhd_zeroes, register the VHD format descriptor and
+ * clear the footer-sized zero buffer.
+ */
+void
+vhd_init()
+{
+       addr_t misalign = (addr_t)__vhd_zeroes & 511;
+
+       // Skip just enough bytes to reach the next 512-byte boundary
+       vhd_zeroes = __vhd_zeroes;
+       if (misalign)
+               vhd_zeroes += 512 - misalign;
+
+       vdisk_register(&vdfd_vhd);
+       memset(vhd_zeroes, 0, VHD_FTR_SZ);
+}
+
+// Module teardown: remove the VHD format descriptor that vhd_init()
+// registered with the vdisk core.
+void
+vhd_exit()
+{
+       vdisk_unregister(&vdfd_vhd);
+}
diff -r 75c61490cc06 tools/vdisk/vhd.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/vhd.h Thu Jun 21 13:05:31 2007 -0400
@@ -0,0 +1,107 @@
+// Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+//
+// Portions have been modified by Virtual Iron Software, Inc.
+// (c) 2007. This file and the modifications can be redistributed and/or
+// modified under the terms and conditions of the GNU General Public
+// License, version 2.1 and not any later version of the GPL, as published
+// by the Free Software Foundation.
+
+#ifndef __VHD_H
+#define __VHD_H
+
+#define VHD_EXTENSION "vhd"
+
+#define VHD_FTR_SZ   (512)
+#define VHD_DHDR_SZ  (1024)
+
+#define VHD_BAT_INVALID_ENTRY (0xffffffff)
+
+#define VHD_CACHE_SZ       (16)
+#define VHD_INVALID_SECTOR (0xffffffff)
+
+// Parent locator entry, as unpacked by vhd_get_dhdr_ple()
+typedef struct ple {
+       uint32_t code;        // platform code, e.g. VHD_DYN_PLE_ABS/_REL
+       uint32_t data_space;  // bytes read from the file for the locator
+       uint32_t data_len;    // bytes of that data making up the parent name
+       uint32_t reserved; // XXX: do we care?
+       uint64_t data_off;    // file offset the locator data is read from
+} ple_t;
+
+typedef struct vhd_cache {         // element type of vhd_file_t.cache[]
+       char *secmap_chunk;         // 512b chunk of block's sectormap 
+       char *sec_mem;              // presumably raw memory behind the chunk
+       uint32_t first_sector;      // First sector of the sectormap
+       uint32_t phys_first_sector; // Sector in the file that first_sector
+                                   // maps to
+       struct vhd_cache *next, *prev; // links to neighbouring entries
+} vhd_cache_t;
+
+typedef struct vhd_stat {      // counters kept in vhd_file_t.stats
+       uint64_t access;       // NOTE(review): what each counter counts is
+       uint64_t cache_hit;    // not visible in this header; presumably
+       uint64_t block_alloc;  // events seen by the block-mapping code.
+       uint64_t sec_alloc;    // Confirm against vhd.c.
+} vhd_stat_t;
+
+typedef struct vhd_xfer {
+       // sectormap *must* be first member!
+       char secmap_chunk[512];   // 512b chunk of sectormap. 
+       off_t secmap_addr;        // Address of the chunk
+       int sector_bit;           // bit to be set in sectormap chunk
+       int num_secs;             // presumably sectors covered; confirm
+       vhd_cache_t *cache;       // presumably the cache entry used; confirm
+       int first_sector;         // presumably first sector moved; confirm
+       file_t fd;                // presumably the file being written; confirm
+       void *mem;             // memory for vhd_xfer
+} vhd_xfer_t;
+
+typedef struct vhd_file {      // per-file VHD state (vd_file_t.vdf points here)
+       char *secmap_chunk;    // 512B-aligned block of sectormap. 
+       char *sec_mem;         // memory for sectormap section
+       char *ftr;             // 512B-aligned footer
+       char *ftr_mem;         // memory for footer
+       char *dhdr;            // 512B-aligned dynamic header
+       char *dhdr_mem;        // memory for dynamic header
+       uint32_t *bat;         // 512B-aligned Block Allocation Table
+       uint32_t *bat_mem;     // memory for BAT
+       vhd_cache_t cache[VHD_CACHE_SZ]; // sectormap cache entries
+       vhd_cache_t *cache_head;  // ends of the list linked through the
+       vhd_cache_t *cache_tail;  // entries' next/prev; ordering: confirm
+       vhd_stat_t stats;         // usage counters
+       int sec_per_block_log;    // presumably log2(sectors per block); confirm
+       int sectormap_sz;         // bytes of sectormap preceding each block
+       off_t next_block_off;     // presumably where the next block goes; confirm
+} vhd_file_t;
+
+
+#define VHD_ARG_SZ        (1<<0)
+#define VHD_ARG_TYPE      (1<<1)
+#define VHD_ARG_BLOCKSZ   (1<<2)
+#define VHD_ARG_UUID      (1<<3)
+#define VHD_ARG_TIME      (1<<4)
+#define VHD_ARG_PARENT    (1<<5)
+
+
+typedef struct vhd_args {      // result of vhd_parse_args()
+       size_t vhd_sz;         // -S: disk size in bytes (option is in MB)
+       uint8_t type;          // VHD_TYPE_* chosen by -f/-d, DIFF if -p given
+       size_t blocksz;        // -B: block size in bytes, 2MB by default
+       uint8_t uuid[16];      // -u: 32 hex digits packed two per byte
+       char *parent;          // -p: malloc()ed copy of the parent name
+       uint64_t args_mask;    // VHD_ARG_* bits for the options that were seen
+} vhd_args_t;
+
+
+extern vdf_data_t vdfd_vhd;
+extern char *vhd_zeroes; // Just a bunch of zeroes
+
+extern int vhd_print_header(vd_file_t *vf);
+extern int vhd_parse_args(int argc, int operations, char *argv[], void **args);
+extern int vhd_create_vdisk(char *filename, void *args);
+extern int vhd_modify_vdisk(struct vdisk_dev *vdisk, void *args);
+extern char *vhd_get_parent_name(vd_file_t *vf, ple_t *ple);
+extern void vhd_init(void);
+extern void vhd_exit(void);
+
+#endif /* __VHD_H */
diff -r 75c61490cc06 tools/vdisk/vhd_footer.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/vhd_footer.h  Thu Jun 21 13:05:31 2007 -0400
@@ -0,0 +1,316 @@
+// Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+//
+// Portions have been modified by Virtual Iron Software, Inc.
+// (c) 2007. This file and the modifications can be redistributed and/or
+// modified under the terms and conditions of the GNU General Public
+// License, version 2.1 and not any later version of the GPL, as published
+// by the Free Software Foundation.
+
+#ifndef __VHD_FOOTER_H
+#define __VHD_FOOTER_H
+
+#include <string.h>
+#include <linux/types.h>
+#include <linux/byteorder/swab.h>
+
+#define VHD_COOKIE (uint64_t) (   (uint64_t)'c'                \
+                              | ((uint64_t)'o'<<(8*1)) \
+                              | ((uint64_t)'n'<<(8*2)) \
+                              | ((uint64_t)'e'<<(8*3)) \
+                              | ((uint64_t)'c'<<(8*4)) \
+                              | ((uint64_t)'t'<<(8*5)) \
+                              | ((uint64_t)'i'<<(8*6)) \
+                              | ((uint64_t)'x'<<(8*7)))
+
+#define VHD_FEATURES_NONE (0)
+#define VHD_FEATURES_TEMP (1)
+#define VHD_FEATURES_RSVD (2) 
+
+#define VHD_FORMAT_VER_1 (0x00010000)
+
+/* data offset for fixed disks */
+#define VHD_FIXED_OFFSET ((uint64_t)-1)
+
+#define VHD_CREATOR_APP ((uint32_t)'v' \
+                        | ((uint32_t)'i'<<8) \
+                        | ((uint32_t)'t'<<16) \
+                        | ((uint32_t)'l'<<24))
+#define VHD_CREATOR_VER_1 (0x00010000)
+
+#define VHD_CREATOR_HOST_OS ((uint32_t)'L' \
+                            | ((uint32_t)'i'<<8) \
+                            | ((uint32_t)'n'<<16) \
+                            | ((uint32_t)'x'<<24))
+
+#define VHD_TYPE_NONE       (0)
+#define VHD_TYPE_FIXED      (2)
+#define VHD_TYPE_DYNAMIC    (3)
+#define VHD_TYPE_DIFF       (4)
+
+
+
+#define VHD_GEOM(c,h,s) { \
+               ASSERT((c<=0xffff) && (h<=0xff) && (s<=0xff)) ; \
+               (s | (h<<8) | (c<<16)); }
+
+
+// All footer fields are read and written one byte at a time rather than
+// through casted pointers: this works for any buffer alignment, does not
+// violate the aliasing rules, and is independent of host byte order.
+// be != 0 selects big-endian, the on-disk order of the numeric fields;
+// the cookie and creator tags are character data and use little-endian,
+// matching how VHD_COOKIE, VHD_CREATOR_APP etc. are composed.
+static inline uint64_t vhd_ftr_load(const char *p, int n, int be) {
+       uint64_t v = 0;
+       int i;
+
+       for (i = 0; i < n; i++)
+               v |= (uint64_t)(uint8_t)p[i] << (8 * (be ? n-1-i : i));
+       return (v);
+}
+static inline void vhd_ftr_store(char *p, uint64_t v, int n, int be) {
+       int i;
+
+       for (i = 0; i < n; i++)
+               p[i] = (char)(uint8_t)(v >> (8 * (be ? n-1-i : i)));
+}
+
+// Cookie: 8 characters at offset 0, compared against VHD_COOKIE
+static inline uint64_t vhd_get_ftr_cookie(char *ftr) {
+       return vhd_ftr_load(&ftr[0], 8, 0);
+}
+static inline void vhd_set_ftr_cookie(char *ftr, uint64_t val) {
+       vhd_ftr_store(&ftr[0], val, 8, 0);
+}
+
+static inline uint32_t vhd_get_ftr_features(char *ftr) {
+       return (uint32_t)vhd_ftr_load(&ftr[8], 4, 1);
+}
+static inline void vhd_set_ftr_features(char *ftr, uint32_t val) {
+       vhd_ftr_store(&ftr[8], val, 4, 1);
+}
+
+static inline uint32_t vhd_get_ftr_fformat(char *ftr) {
+       return (uint32_t)vhd_ftr_load(&ftr[12], 4, 1);
+}
+static inline void vhd_set_ftr_fformat(char *ftr, uint32_t val) {
+       vhd_ftr_store(&ftr[12], val, 4, 1);
+}
+
+// Data offset; VHD_FIXED_OFFSET for fixed disks
+static inline uint64_t vhd_get_ftr_dataoff(char *ftr) {
+       return vhd_ftr_load(&ftr[16], 8, 1);
+}
+static inline void vhd_set_ftr_dataoff(char *ftr, uint64_t val) {
+       vhd_ftr_store(&ftr[16], val, 8, 1);
+}
+
+// Timestamp, 32 bits at offset 24
+static inline uint32_t vhd_get_ftr_timestamp(char *ftr) {
+       return (uint32_t)vhd_ftr_load(&ftr[24], 4, 1);
+}
+static inline void vhd_set_ftr_timestamp(char *ftr, uint32_t val) {
+       vhd_ftr_store(&ftr[24], val, 4, 1);
+}
+
+// Creator application: 4 characters, cf. VHD_CREATOR_APP
+static inline uint32_t vhd_get_ftr_cr_app(char *ftr) {
+       return (uint32_t)vhd_ftr_load(&ftr[28], 4, 0);
+}
+static inline void vhd_set_ftr_cr_app(char *ftr, uint32_t val) {
+       vhd_ftr_store(&ftr[28], val, 4, 0);
+}
+
+static inline uint32_t vhd_get_ftr_cr_ver(char *ftr) {
+       return (uint32_t)vhd_ftr_load(&ftr[32], 4, 1);
+}
+static inline void vhd_set_ftr_cr_ver(char *ftr, uint32_t val) {
+       vhd_ftr_store(&ftr[32], val, 4, 1);
+}
+
+// Creator host OS: 4 characters, cf. VHD_CREATOR_HOST_OS
+static inline uint32_t vhd_get_ftr_cr_hostos(char *ftr) {
+       return (uint32_t)vhd_ftr_load(&ftr[36], 4, 0);
+}
+static inline void vhd_set_ftr_cr_hostos(char *ftr, uint32_t val) {
+       vhd_ftr_store(&ftr[36], val, 4, 0);
+}
+
+static inline uint64_t vhd_get_ftr_orig_sz(char *ftr) {
+       return vhd_ftr_load(&ftr[40], 8, 1);
+}
+static inline void vhd_set_ftr_orig_sz(char *ftr, uint64_t val) {
+       vhd_ftr_store(&ftr[40], val, 8, 1);
+}
+
+static inline uint64_t vhd_get_ftr_cur_sz(char *ftr) {
+       return vhd_ftr_load(&ftr[48], 8, 1);
+}
+static inline void vhd_set_ftr_cur_sz(char *ftr, uint64_t val) {
+       vhd_ftr_store(&ftr[48], val, 8, 1);
+}
+
+// CHS geometry word, laid out as VHD_GEOM() packs it
+static inline uint32_t vhd_get_ftr_geom(char *ftr) {
+       return (uint32_t)vhd_ftr_load(&ftr[56], 4, 1);
+}
+static inline void vhd_set_ftr_geom(char *ftr, uint32_t val) {
+       vhd_ftr_store(&ftr[56], val, 4, 1);
+}
+
+// Disk type: one of VHD_TYPE_*
+static inline uint32_t vhd_get_ftr_type(char *ftr) {
+       return (uint32_t)vhd_ftr_load(&ftr[60], 4, 1);
+}
+static inline void vhd_set_ftr_type(char *ftr, uint32_t val) {
+       vhd_ftr_store(&ftr[60], val, 4, 1);
+}
+
+#define VHD_FTR_CHKSUM_OFF (64)
+static inline uint32_t vhd_get_ftr_chksum(char *ftr) {
+       return (uint32_t)vhd_ftr_load(&ftr[64], 4, 1);
+}
+static inline void vhd_set_ftr_chksum(char *ftr, uint32_t val) {
+       vhd_ftr_store(&ftr[64], val, 4, 1);
+}
+
+static inline uint8_t *vhd_get_ftr_uid(char *ftr) {
+       return (uint8_t *)&ftr[68];
+}
+static inline void vhd_set_ftr_uid(char *ftr, uint8_t *val) {
+       memcpy(&ftr[68], val, 16);
+}
+
+static inline uint8_t vhd_get_ftr_saved_state(char *ftr) {
+       return (uint8_t)ftr[84];
+}
+static inline void vhd_set_ftr_saved_state(char *ftr, uint8_t val) {
+       ftr[84] = (char)val;
+}
+
+
+
+#define VHD_DYN_COOKIE (uint64_t) (   (uint64_t)'c'            \
+                                     | ((uint64_t)'x'<<(8*1))  \
+                                     | ((uint64_t)'s'<<(8*2))  \
+                                     | ((uint64_t)'p'<<(8*3))  \
+                                     | ((uint64_t)'a'<<(8*4))  \
+                                     | ((uint64_t)'r'<<(8*5))  \
+                                     | ((uint64_t)'s'<<(8*6))  \
+                                     | ((uint64_t)'e'<<(8*7)))
+
+#define VHD_DYN_OFFSET      ((uint64_t)-1)
+#define VHD_DYN_HDR_VER_1   (0x00010000)
+
+// Parent locator codes (our own)
+#define VHD_DYN_PLE_ABS   (   (uint64_t)'u'            \
+                             | ((uint64_t)'x'<<(8*1))  \
+                             | ((uint64_t)'n'<<(8*2))  \
+                             | ((uint64_t)'L'<<(8*3)))
+#define VHD_DYN_PLE_REL   (   (uint64_t)'k'            \
+                             | ((uint64_t)'x'<<(8*1))  \
+                             | ((uint64_t)'n'<<(8*2))  \
+                             | ((uint64_t)'L'<<(8*3)))
+
+
+// Dynamic header fields are accessed one byte at a time (no casted
+// pointers), so alignment, aliasing and host byte order do not matter.
+// be != 0 means big-endian, the on-disk order of the numeric fields.
+static inline uint64_t vhd_dhdr_load(const char *p, int n, int be) {
+       uint64_t v = 0;
+       int i;
+
+       for (i = 0; i < n; i++)
+               v |= (uint64_t)(uint8_t)p[i] << (8 * (be ? n-1-i : i));
+       return (v);
+}
+static inline void vhd_dhdr_store(char *p, uint64_t v, int n, int be) {
+       int i;
+
+       for (i = 0; i < n; i++)
+               p[i] = (char)(uint8_t)(v >> (8 * (be ? n-1-i : i)));
+}
+
+static inline uint64_t vhd_get_dhdr_cookie(char *hdr) {
+       return vhd_dhdr_load(&hdr[0], 8, 0);  // characters, cf. VHD_DYN_COOKIE
+}
+static inline void vhd_set_dhdr_cookie(char *hdr, uint64_t val) {
+       vhd_dhdr_store(&hdr[0], val, 8, 0);
+}
+
+static inline uint64_t vhd_get_dhdr_dataoff(char *hdr) {
+       return vhd_dhdr_load(&hdr[8], 8, 1);
+}
+static inline void vhd_set_dhdr_dataoff(char *hdr, uint64_t val) {
+       vhd_dhdr_store(&hdr[8], val, 8, 1);
+}
+
+static inline uint64_t vhd_get_dhdr_tbloff(char *hdr) {
+       return vhd_dhdr_load(&hdr[16], 8, 1);
+}
+static inline void vhd_set_dhdr_tbloff(char *hdr, uint64_t val) {
+       vhd_dhdr_store(&hdr[16], val, 8, 1);
+}
+
+static inline uint32_t vhd_get_dhdr_hdrver(char *hdr) {
+       return (uint32_t)vhd_dhdr_load(&hdr[24], 4, 1);
+}
+static inline void vhd_set_dhdr_hdrver(char *hdr, uint32_t val) {
+       vhd_dhdr_store(&hdr[24], val, 4, 1);
+}
+
+static inline uint32_t vhd_get_dhdr_tbl_entries(char *hdr) {
+       return (uint32_t)vhd_dhdr_load(&hdr[28], 4, 1);
+}
+static inline void vhd_set_dhdr_tbl_entries(char *hdr, uint32_t val) {
+       vhd_dhdr_store(&hdr[28], val, 4, 1);
+}
+
+static inline uint32_t vhd_get_dhdr_blksz(char *hdr) {
+       return (uint32_t)vhd_dhdr_load(&hdr[32], 4, 1);
+}
+static inline void vhd_set_dhdr_blksz(char *hdr, uint32_t val) {
+       vhd_dhdr_store(&hdr[32], val, 4, 1);
+}
+
+#define VHD_DHDR_CHKSUM_OFF (36)
+static inline uint32_t vhd_get_dhdr_chksum(char *hdr) {
+       return (uint32_t)vhd_dhdr_load(&hdr[36], 4, 1);
+}
+static inline void vhd_set_dhdr_chksum(char *hdr, uint32_t val) {
+       vhd_dhdr_store(&hdr[36], val, 4, 1);
+}
+
+static inline uint8_t *vhd_get_dhdr_puid(char *hdr) {
+       return (uint8_t *)&hdr[40];
+}
+static inline void vhd_set_dhdr_puid(char *hdr,  uint8_t *val) {
+       memcpy(&hdr[40], val, 16);
+}
+
+static inline uint32_t vhd_get_dhdr_ptimestamp(char *hdr) {
+       return (uint32_t)vhd_dhdr_load(&hdr[56], 4, 1);
+}
+static inline void vhd_set_dhdr_ptimestamp(char *hdr, uint32_t val) {
+       vhd_dhdr_store(&hdr[56], val, 4, 1);
+}
+
+static inline void vhd_get_dhdr_ple(char *hdr, ple_t *ple, int idx) {
+       char *tmp = &hdr[576+24*idx];   // 24-byte entries from offset 576
+       ple->code = (uint32_t)vhd_dhdr_load(tmp, 4, 1);
+       ple->data_space = (uint32_t)vhd_dhdr_load(tmp+4, 4, 1);
+       ple->data_len = (uint32_t)vhd_dhdr_load(tmp+8, 4, 1);
+       ple->data_off = vhd_dhdr_load(tmp+16, 8, 1);
+}
+
+static inline void vhd_set_dhdr_ple(char *hdr, ple_t *ple, int idx) {
+       char *tmp = &hdr[576+24*idx];   // bytes 12..15 (reserved) are left alone
+       vhd_dhdr_store(tmp, ple->code, 4, 1);
+       vhd_dhdr_store(tmp+4, ple->data_space, 4, 1);
+       vhd_dhdr_store(tmp+8, ple->data_len, 4, 1);
+       vhd_dhdr_store(tmp+16, ple->data_off, 8, 1);
+}
+
+
+
+#endif /* __VHD_FOOTER_H */
diff -r 75c61490cc06 tools/vdisk/vhd_utils.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/vhd_utils.c   Thu Jun 21 13:05:31 2007 -0400
@@ -0,0 +1,964 @@
+// Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+//
+// Portions have been modified by Virtual Iron Software, Inc.
+// (c) 2007. This file and the modifications can be redistributed and/or
+// modified under the terms and conditions of the GNU General Public
+// License, version 2.1 and not any later version of the GPL, as published
+// by the Free Software Foundation.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <linux/stddef.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <time.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+
+#include "vdisk.h"
+#include "vdisk_utils.h"
+#include "vhd.h"
+#include "vhd_footer.h"
+
+
+// CHS geometry for a disk of sz bytes, packed with VHD_GEOM(). Algorithm
+// taken from Microsoft's VHD spec (hence notations...)
+static uint32_t
+vhd_chs(ssize_t sz)
+{
+       // Count sectors in 64 bits and clamp before narrowing; truncating
+       // first made sizes of 2TB and up wrap around to a tiny geometry.
+       uint64_t sectors = (uint64_t)sz >> 9; // Assume sz in whole #sectors
+       uint32_t totalSectors;
+       int sectorsPerTrack, heads, cylinderTimesHeads, cylinders;
+
+       if (sectors > 65535 * 16 * 255)
+               sectors = 65535 * 16 * 255;
+       totalSectors = (uint32_t)sectors;
+
+       if (totalSectors >= 65535 * 16 * 63) {
+               sectorsPerTrack = 255;
+               heads = 16;
+               cylinderTimesHeads = totalSectors / sectorsPerTrack;
+       } else {
+               sectorsPerTrack = 17;
+               cylinderTimesHeads = totalSectors / sectorsPerTrack;
+               heads = (cylinderTimesHeads + 1023) / 1024;
+               if (heads < 4)
+                       heads = 4;
+
+               if (cylinderTimesHeads >= (heads * 1024) || heads > 16) {
+                       sectorsPerTrack = 31;
+                       heads = 16;
+                       cylinderTimesHeads = totalSectors / sectorsPerTrack;
+               }
+               if (cylinderTimesHeads >= (heads * 1024)) {
+                       sectorsPerTrack = 63;
+                       heads = 16;
+                       cylinderTimesHeads = totalSectors / sectorsPerTrack;
+               }
+       }
+       cylinders = cylinderTimesHeads / heads;
+       return (VHD_GEOM(cylinders, heads, sectorsPerTrack));
+}
+
+// One's complement of the byte sum of ptr[0..sz). If excl is not NULL it
+// points at the 4-byte checksum field inside that buffer, whose current
+// contents are left out of the sum. Returns 0 if ptr is NULL.
+uint32_t
+vhd_chksum(char *ptr, size_t sz, char *excl)
+{
+       uint32_t chksum = 0;
+       size_t i;
+
+       if (ptr == NULL)
+               return (0);
+       for (i = 0; i < sz; i++)
+               chksum += (uint8_t)ptr[i];
+       // Subtract the bytes as unsigned, exactly as they were added: plain
+       // char may be signed, which broke the sum for any byte >= 0x80
+       for (i = 0; excl != NULL && i < 4; i++)
+               chksum -= (uint8_t)excl[i];
+       return (~chksum);
+}
+
+
+// Shift a timestamp between two epochs, Jan 1 2000 (used for the value in
+// *file_time) and Jan 1 1970 (used for the value in *cur_time), by the
+// distance between them.
+//   f2c != 0: read *file_time, store the shifted value in *cur_time
+//   f2c == 0: read *cur_time, store the shifted value in *file_time
+// The output pointer may be NULL. Returns ctime()'s text for the shifted
+// value with the trailing newline removed, or NULL on error.
+static char *
+vhd_time(uint32_t *file_time, uint32_t *cur_time, int f2c)
+{
+       struct tm epoch_1970 = {0,0,0,1,0,70,0,0,0};
+       struct tm epoch_2000 = {0,0,0,1,0,100,0,0,0};
+       // Direction only decides which argument is input and which is output
+       uint32_t *src = f2c ? file_time : cur_time;
+       uint32_t *dst = f2c ? cur_time : file_time;
+       time_t tm_1970, tm_2000, shift, tm;
+       char *timestr;
+
+       // Distance between the two epochs, both taken through mktime()
+       tm_1970 = mktime(&epoch_1970);
+       tm_2000 = mktime(&epoch_2000);
+       shift = tm_2000 - tm_1970;
+
+       // Missing input: zero the output (if any) and give up
+       if (src == NULL) {
+               VIDDBG(0, "Invalid time\n");
+               if (dst != NULL)
+                       *dst = 0;
+               return NULL;
+       }
+
+       if (f2c)
+               tm = *src + shift;
+       else
+               tm = *src - shift;
+
+       timestr = ctime(&tm);
+       if (timestr == NULL) {
+               VIDDBG(0, "Couldn't convert time (0x%x)\n", *src);
+               return NULL;
+       }
+
+       if (dst != NULL)
+               *dst = tm;
+
+       // Overwrite the last character of ctime()'s text (the newline)
+       timestr[strlen(timestr)-1] = '\0';
+       return timestr;
+}
+
+
+// Read the parent locator data that ple describes from file vf. Returns a
+// NUL-terminated vdisk_malloc()ed copy (the caller frees it), NULL on error.
+char *
+vhd_get_parent_name(vd_file_t *vf, ple_t *ple)
+{
+       char *cp, *buf, *pool;
+       int bytes;
+
+       // data_len bytes are copied out of a data_space-sized read
+       if (!ple || !ple->data_len || ple->data_len > ple->data_space) {
+               VIDDBG(0, "Invalid data\n");
+               return (NULL);
+       }
+
+       // The file is opened with O_DIRECT, so we need to
+       // align buffer on 512-byte boundary
+       pool = buf = vdisk_malloc(ple->data_space+512);
+       if (buf == NULL) {
+               perror("malloc");
+               return (NULL);
+       }
+       while((addr_t)buf & 511) buf++;
+
+       if (lseek(vf->fd, ple->data_off, SEEK_SET) != ple->data_off) {
+               perror("lseek");
+               vdisk_free(pool);
+               return NULL;
+       }
+       bytes = read(vf->fd, buf, (size_t)ple->data_space);
+       if (bytes != ple->data_space) {
+               perror("read");
+               vdisk_free(pool);
+               VIDDBG(0, "fd = %d\n", vf->fd);
+               return NULL;
+       }
+       cp = vdisk_malloc(ple->data_len+1);
+       if (cp == NULL) {
+               perror("malloc");
+               vdisk_free(pool);
+               return (NULL);
+       }
+       // Terminate the copy that is returned, not the scratch buffer
+       strncpy(cp, buf, ple->data_len);
+       cp[ple->data_len] = 0;
+       vdisk_free(pool);
+       //XXX: for codes W2Ru and W2ku we need to convert from UTF-16 to ASCII
+       return cp;
+}
+
+// Print the footer of vf and, for dynamic and differencing disks, the
+// dynamic header and parent locators. Returns 0, or vdisk_size()'s error.
+int
+vhd_print_header(vd_file_t *vf)
+{
+       char *cp;
+       uint64_t v64;
+       uint32_t v32;
+       vhd_file_t *vhd = (vhd_file_t *)(vf->vdf);
+       size_t sz, max_sz;
+       int i;
+       int err;
+
+       // Figure out max file size
+
+       err = vdisk_size(vf->fd, &sz);
+       if (err) {
+               VIDDBG(0, "Couldn't get file size\n");
+               return (err);
+       }
+
+       if (vhd_get_ftr_type(vhd->ftr) == VHD_TYPE_FIXED)
+               max_sz = sz;
+       else {
+               uint64_t unmapped_blocks = 0;
+               size_t new_bytes;
+
+               // Count blocks that haven't been allocated
+               for (i=0; i< vhd_get_dhdr_tbl_entries(vhd->dhdr); i++)
+                       if (__arch__swab32(vhd->bat[i]) == 
+                           VHD_BAT_INVALID_ENTRY)
+                               unmapped_blocks++;
+
+               // XXX: Assume that block size is in 512-byte chunks
+               new_bytes = unmapped_blocks * (vhd->sectormap_sz + 
+                                              vhd_get_dhdr_blksz(vhd->dhdr));
+               max_sz = sz + new_bytes;
+       }
+
+       printf("FILE %s:\n", vf->name);
+       printf("\tMaximum file size:\t0x%016zx\n\n", max_sz);
+
+       v64 = vhd_get_ftr_cookie(vhd->ftr);
+       cp = (char *)&v64;
+       printf("\tCookie:\t\t\t0x%016" PRIx64 " (\"%c%c%c%c%c%c%c%c\")\n", v64,
+              cp[0], cp[1], cp[2], cp[3], cp[4], cp[5], cp[6], cp[7]);
+       printf("\tFeatures:\t\t0x%08x\n", vhd_get_ftr_features(vhd->ftr));
+       printf("\tFile format version:\t0x%08x\n",
+              vhd_get_ftr_fformat(vhd->ftr));
+       printf("\tData Offset:\t\t0x%016" PRIx64 "\n", 
+              vhd_get_ftr_dataoff(vhd->ftr));
+
+       v32 = vhd_get_ftr_timestamp(vhd->ftr);
+       printf("\ttimestamp:\t\t0x%08x (%s)\n", v32, vhd_time(&v32, NULL, 1));
+       printf("\tCreator App:\t\t0x%08x\n", vhd_get_ftr_cr_app(vhd->ftr));
+       printf("\tCreator Ver:\t\t0x%08x\n", vhd_get_ftr_cr_ver(vhd->ftr));
+       printf("\tCreator Host OS:\t0x%08x\n", vhd_get_ftr_cr_hostos(vhd->ftr));
+       printf("\tOriginal size:\t\t0x%016" PRIx64 "\n",
+              vhd_get_ftr_orig_sz(vhd->ftr));
+       printf("\tCurrent size:\t\t0x%016" PRIx64 "\n", 
+              vhd_get_ftr_cur_sz(vhd->ftr));
+       printf("\tGeometry:\t\t0x%08x\n", vhd_get_ftr_geom(vhd->ftr));
+       printf("\tType:\t\t\t0x%08x\n", vhd_get_ftr_type(vhd->ftr));
+       printf("\tChecksum:\t\t0x%08x\n", vhd_get_ftr_chksum(vhd->ftr));
+
+       printf("\tUnique ID:\t\t");
+       cp = (char *)vhd_get_ftr_uid(vhd->ftr);
+       for (i=0;i<16;i++)
+               printf("%02x", (*cp++) & 0xff);
+
+       printf("\n\tSaved state:\t\t0x%08x\n",
+              vhd_get_ftr_saved_state(vhd->ftr));
+       if ((vhd_get_ftr_type(vhd->ftr) == VHD_TYPE_DYNAMIC ) ||
+           (vhd_get_ftr_type(vhd->ftr) == VHD_TYPE_DIFF )) {
+               printf(" Dynamic Header:\n");
+               v64 = vhd_get_dhdr_cookie(vhd->dhdr);
+               cp = (char *)&v64;
+               printf("\t Cookie:\t\t0x%016" PRIx64
+                      " (\"%c%c%c%c%c%c%c%c\")\n", v64,
+                      cp[0], cp[1], cp[2], cp[3], cp[4], cp[5], cp[6], cp[7]);
+               printf("\t Data Offset:\t\t0x%016" PRIx64 "\n",
+                      vhd_get_dhdr_dataoff(vhd->dhdr));
+               printf("\t Table Offset:\t\t0x%016" PRIx64 "\n",
+                      vhd_get_dhdr_tbloff(vhd->dhdr));
+               printf("\t Max Table Entries:\t0x%08x\n", 
+                      vhd_get_dhdr_tbl_entries(vhd->dhdr));
+               printf("\t Block Size:\t\t0x%08x\n", 
+                      vhd_get_dhdr_blksz(vhd->dhdr));
+               printf("\t Checksum:\t\t0x%08x\n", 
+                      vhd_get_dhdr_chksum(vhd->dhdr));
+       }
+
+       if (vhd_get_ftr_type(vhd->ftr) == VHD_TYPE_DIFF ) {
+
+               printf("\t Parent Unique ID:\t"); 
+               cp = (char *)vhd_get_dhdr_puid(vhd->dhdr);
+               for (i=0;i<16;i++)
+                       printf("%02x", (*cp++) & 0xff);
+               v32 = vhd_get_dhdr_ptimestamp(vhd->dhdr);
+               printf("\n\t Parent Timestamp:\t0x%08x (%s)\n", 
+                      v32, vhd_time(&v32, NULL, 1));
+
+               for (i=0;i<8;i++) {
+                       ple_t ple;
+
+                       vhd_get_dhdr_ple(vhd->dhdr, &ple, i);
+                       if (ple.code != 0) {
+                               printf("\t Parent Locator Entry %d:\n", i);
+
+                               cp = (char *)&ple.code;
+                               printf("\t\tPlatform Code:\t0x%08x "
+                                      "(\"%c%c%c%c\")\n", 
+                                      ple.code, cp[3], cp[2], cp[1], cp[0]);
+                               printf("\t\tData Space:\t0x%08x\n",
+                                      ple.data_space);
+                               printf("\t\tData Length:\t0x%08x\n",
+                                      ple.data_len);
+                               printf("\t\tData Offset:\t0x%016" PRIx64" \n",
+                                      ple.data_off);
+                               cp = vhd_get_parent_name(vf, &ple);
+                               if (cp == NULL) {
+                                       VIDDBG(0, "Can't locate parent info "
+                                              "in file\n");
+                                       continue;
+                               }
+                               printf("\t\tParent Locator:\t%s\n", cp);
+                               vdisk_free(cp);  // came from vdisk_malloc()
+                       }
+               }
+       }
+
+       return (0);
+}
+
+// Parse the VHD-specific command line options into a newly allocated
+// vhd_args_t stored in *args. 'operations' (VDISK_OP_* bits) selects the
+// consistency checks. On any error after the initial allocation the usage
+// line is printed, everything allocated here is freed, and -1 is returned.
+int
+vhd_parse_args(int argc, int operations, char *argv[], void **args)
+{
+       int c;  // int: where char is unsigned it can never equal getopt()'s -1
+       int i;
+       extern char *optarg;
+       extern int optind, opterr, optopt;
+       vhd_args_t *vhd_args;
+
+       void vhd_usage() {
+               fprintf(stderr, "VHD-specific options: "
+                       "-S <size(MB)> [-f|-d [-p <parent>]] [-B <size(B)>]"
+                       " [-u UUID] [-t]\n");
+       }
+
+       vhd_args = malloc(sizeof(vhd_args_t));
+       if (vhd_args == NULL) {
+               VIDDBG(0, "Can't allocate arguments\n");
+               return (-1);
+       }
+
+       memset(vhd_args, 0, sizeof(vhd_args_t));
+       vhd_args->type = VHD_TYPE_NONE;
+       vhd_args->blocksz = 0x200000; // 2MB
+
+       while (1) {
+               c = getopt(argc, argv, "S:fdstB:u:p:");
+               if (c == -1)
+                       break;
+
+               switch (c) {
+               case 'f':
+                       vhd_args->type = VHD_TYPE_FIXED;
+                       vhd_args->args_mask |= VHD_ARG_TYPE;
+                       break;
+               case 's': // 's' for "sparse"
+                       VIDDBG(0, "'-s' option is obsolete. "
+                              "Use '-d' instead\n");
+                       // fall through: '-s' is handled as '-d'
+               case 'd':
+                       vhd_args->type = VHD_TYPE_DYNAMIC;
+                       vhd_args->args_mask |= VHD_ARG_TYPE;
+                       break;
+               case 'S':
+                       vhd_args->vhd_sz = atol(optarg) * 1024 * 1024;
+                       vhd_args->args_mask |= VHD_ARG_SZ;
+                       break;
+               case 't':
+                       vhd_args->args_mask |= VHD_ARG_TIME;
+                       break;
+               case 'p':
+                       vhd_args->args_mask |= VHD_ARG_PARENT;
+                       free(vhd_args->parent);  // drop an earlier -p, if any
+                       vhd_args->parent = malloc(strlen(optarg)+1);
+                       if (vhd_args->parent == NULL) {
+                               VIDDBG(0, "Out of memory\n");
+                               goto fail;
+                       }
+                       strncpy(vhd_args->parent, optarg, strlen(optarg)+1);
+                       break;
+               case 'B':
+                       vhd_args->blocksz = atol(optarg);
+                       // Must be in 512 byte chunks; 0 would divide by zero
+                       if (!vhd_args->blocksz || (vhd_args->blocksz & 511)) {
+                               VIDDBG(0, "block size must be a non-zero "
+                                      "multiple of 512\n");
+                               goto fail;
+                       }
+                       vhd_args->args_mask |= VHD_ARG_BLOCKSZ;
+                       break;
+               case 'u':
+                       if ((optarg == NULL) || (strlen(optarg) != 32)) {
+                               VIDDBG(0, "UUID is a 16-byte (32-character)"
+                                      " string\n");
+                               goto fail;
+                       }
+
+                       // Convert UUID characters to hex
+                       for(i=0;i<32;i++) {
+                               uint8_t val;
+
+                               val = optarg[i];
+                               if (!isxdigit(val)) {
+                                       VIDDBG(0, "Invalid character in UUID "
+                                              "string ('%c')\n", optarg[i]);
+                                       goto fail;  // also frees parent
+                               }
+                               if (isalpha(val)) {
+                                       val = tolower(val);
+                                       val -= ('a' - 0xa);
+                               } else
+                                       val -= '0';
+                               // two hex numbers per byte
+                               vhd_args->uuid[i>>1] |= (val << (4*((i&1)^1)));
+                       }
+                       vhd_args->args_mask |= VHD_ARG_UUID;
+                       break;
+               default:
+                       goto fail;      // fail: prints the usage line
+               }
+       }
+
+       if ((vhd_args->parent != NULL) && (vhd_args->type == VHD_TYPE_FIXED)) {
+               VIDDBG(0, "Fixed VHD cannot have a parent\n");
+               goto fail;
+       }
+
+       if (operations & VDISK_OP_CREATE) {
+               if (vhd_args->parent == NULL) {
+                       if ((vhd_args->vhd_sz == 0) ||
+                           (vhd_args->type == VHD_TYPE_NONE))
+                               goto fail;
+               }
+       }
+
+       if (vhd_args->vhd_sz % vhd_args->blocksz) {
+               VIDDBG(0, "File size must be multiple of block size\n");
+               goto fail;
+       }
+
+       if (operations & VDISK_OP_MODIFY) {
+               if (vhd_args->args_mask & VHD_ARG_PARENT) {
+                       if (vhd_args->type == VHD_TYPE_FIXED) {
+                               VIDDBG(0, "Fixed VHDs can't have parents\n");
+                               goto fail;
+                       }
+               }
+               if (vhd_args->args_mask & (VHD_ARG_SZ | VHD_ARG_BLOCKSZ)) {
+                       VIDDBG(0, "Can't modify VHD's size or block size\n");
+                       goto fail;
+               }
+       }
+
+       if (vhd_args->parent != NULL) {
+               vhd_args->type = VHD_TYPE_DIFF;
+               if (vhd_args->args_mask & (VHD_ARG_SZ | VHD_ARG_BLOCKSZ)) {
+                       VIDDBG(0, "Differencing VHD's size and block size "
+                              "are inherited from parent\n");
+                       goto fail;
+               }
+       }
+
+       *args = vhd_args;
+       return (0);
+
+fail:
+       free(vhd_args->parent);
+       free(vhd_args);
+       vhd_usage();
+       return (-1);
+}
+
+// Store differencing file's parent information.
+//
+// Copies the parent's UID and timestamp (taken from pvhd's footer) into the
+// child's dynamic header, clears all eight parent locator entries, fills
+// entry 0 with a locator describing parentname, recomputes the dynamic
+// header checksum, and then writes the dynamic header (at file offset
+// VHD_FTR_SZ) and the NUL-terminated parent name (at the locator's data
+// offset) to vfd.
+//
+// vfd        - descriptor of the child VHD file, open for writing
+// vhd        - child's in-memory state; vhd->dhdr is modified in place
+// pvhd       - parent's in-memory state; only pvhd->ftr is read
+// parentname - parent path; a leading '/' selects VHD_DYN_PLE_ABS,
+//              anything else VHD_DYN_PLE_REL
+// data       - if not NULL, receives the file offset just past the space
+//              reserved for the locator data
+//
+// Returns 0 on success, or an errno value (EIO for a short write).
+static int
+vhd_store_parent(int vfd, vhd_file_t *vhd, vhd_file_t *pvhd,
+                char *parentname, loff_t *data)
+{
+       uint32_t bat_sz;
+       ple_t ple;
+       size_t name_len = strlen(parentname) + 1; // name is stored with its NUL
+       ssize_t bytes;
+       int i;
+       int err;
+
+       vhd_set_dhdr_puid(vhd->dhdr, vhd_get_ftr_uid(pvhd->ftr));
+       vhd_set_dhdr_ptimestamp(vhd->dhdr,
+                               vhd_get_ftr_timestamp(pvhd->ftr));
+
+       // Clear every parent locator entry; only entry 0 is filled in below
+       memset(&ple, 0, sizeof(ple_t));
+       for (i = 0; i < 8; i++)
+               vhd_set_dhdr_ple(vhd->dhdr, &ple, i);
+
+       if (parentname[0] == '/')
+               ple.code = VHD_DYN_PLE_ABS;
+       else
+               ple.code = VHD_DYN_PLE_REL;
+
+       // XXX: The spec says this is number of 512b sectors,
+       // but file created by MS's Virtual PC tool seems to
+       // think this is number of bytes, aligned at 512b
+       // NOTE(review): adding 512 (not 511) before masking reserves one
+       // extra sector when name_len is already a multiple of 512. Left
+       // unchanged because it determines the on-disk layout.
+       ple.data_space = (name_len + 512) & (~511);
+       ple.data_len = name_len;
+
+       // Size of the BAT in bytes (4 bytes per entry)
+       bat_sz = vhd_get_dhdr_tbl_entries(vhd->dhdr) << 2;
+
+       // Locator data lives after footer copy, dynamic header, the BAT
+       // rounded up to a sector, and one more 512-byte block
+       ple.data_off = VHD_DHDR_SZ + VHD_FTR_SZ +
+               bat_sz +
+               ((bat_sz & 511) ? (512-(bat_sz&511)) : 0) +
+               512; // XXX: see comment in vhd_create_vdisk()
+       vhd_set_dhdr_ple(vhd->dhdr, &ple, 0);
+
+       // Recalculate checksum
+       vhd_set_dhdr_chksum(vhd->dhdr,
+                           vhd_chksum(vhd->dhdr, VHD_DHDR_SZ,
+                                      &vhd->dhdr[VHD_DHDR_CHKSUM_OFF]));
+
+       // The dynamic header follows the footer copy at the front of the file
+       if (lseek(vfd, VHD_FTR_SZ, SEEK_SET) != VHD_FTR_SZ) {
+               err = errno;
+               VIDDBG(0, "lseek: %s\n", strerror(err));
+               return (err);
+       }
+
+       // Write the dynamic header
+       bytes = write(vfd, vhd->dhdr, VHD_DHDR_SZ);
+       if (bytes != (ssize_t)VHD_DHDR_SZ) {
+               // errno is only meaningful when write() itself failed
+               err = (bytes < 0) ? errno : EIO;
+               VIDDBG(0, "write: %s\n", strerror(err));
+               return (err);
+       }
+
+       // Write PLE
+       if (lseek(vfd, ple.data_off, SEEK_SET) == (off_t)-1) {
+               err = errno;
+               VIDDBG(0, "lseek: %s\n", strerror(err));
+               return (err);
+       }
+       bytes = write(vfd, parentname, name_len);
+       if (bytes != (ssize_t)name_len) {
+               err = (bytes < 0) ? errno : EIO;
+               VIDDBG(0, "write: %s\n", strerror(err));
+               return (err);
+       }
+
+       if (data != NULL)
+               *data = (loff_t)ple.data_off + (loff_t)ple.data_space;
+
+       return (0);
+}
+
+
+// Apply the modifications selected by args (a vhd_args_t) to an existing
+// VHD. Depending on the bits set in args_mask this replaces the footer's
+// UUID (VHD_ARG_UUID), rewrites the parent information in the dynamic
+// header (VHD_ARG_PARENT) and/or refreshes the footer's timestamp
+// (VHD_ARG_TIME). Each file visited is first closed and reopened O_RDWR.
+// When the footer changed it is written back at the end of the file and,
+// for non-fixed disks, at offset 0 as well.
+//
+// vdisk - the virtual disk whose file list (vdf_head) is walked
+// args  - vhd_args_t describing the requested changes
+//
+// Returns 0 on success; on failure returns -1, an errno value, or the
+// error reported by the helper that failed.
+int
+vhd_modify_vdisk(struct vdisk_dev *vdisk, void *args)
+{
+       vhd_args_t *vhd_args = args;
+       vd_file_t *vf = NULL;
+       vhd_file_t *vhd;
+       size_t sz;
+       ssize_t bytes;
+       int err;
+       int store_footer = 0;
+       struct list_head *ptr;
+       int stop = 0;
+
+
+       // XXX: We always make a single pass
+       list_for_each(ptr, &vdisk->vdf_head) {
+
+               vf = list_entry(ptr, vd_file_t, vdf_list);
+               if ((vf == NULL) || (vf->vdf == NULL)) {
+                       VIDDBG(0, "Can't access vdisk's structures\n");
+                       return (-1);
+               }
+               vhd = (vhd_file_t *)vf->vdf;
+
+               // Close and reopen file (it may have been open O_DIRECT)
+               err = vdisk_close(vf->fd);
+               if (err) {
+                       VIDDBG(0, "Can't close %s:%d\n", vf->name, err);
+                       return (err);
+               }
+
+               vf->fd = open(vf->name, O_RDWR, 0644);
+               if (vf->fd == -1) {
+                       err = errno;
+                       VIDDBG(0, "Can't open %s:%s\n", vf->name,
+                              strerror(err));
+                       return (err);
+               }
+
+               // Update UUID
+               if (vhd_args->args_mask & VHD_ARG_UUID) {
+
+                       vhd_set_ftr_uid(vhd->ftr, vhd_args->uuid);
+
+                       store_footer = 1;
+                       stop = 1;
+               }
+
+               // Change parent name
+               if (vhd_args->args_mask & VHD_ARG_PARENT) {
+                       vhd_file_t *pvhd;
+                       struct vdisk_dev parent;
+                       vd_file_t *pvf;
+
+                       // Open parent file
+                       err = vdisk_init(&parent, vhd_args->parent, NULL, 0);
+                       if (err) {
+                               VIDDBG(0, "Failed to initialize state for "
+                                      "parent %s\n", vhd_args->parent);
+                               return (err);
+                       }
+                       // The parent's own VHD state hangs off the first
+                       // entry of its file list
+                       pvf = list_entry(parent.vdf_head.next, vd_file_t,
+                                        vdf_list);
+                       pvhd = (vhd_file_t *)pvf->vdf;
+
+                       // Update dynamic header and parent data
+                       err = vhd_store_parent(vf->fd, vhd, pvhd,
+                                              vhd_args->parent, NULL);
+                       if (err) {
+                               VIDDBG(0, "Failed to store parent name (%s)\n",
+                                      vhd_args->parent);
+                               vdisk_fini(&parent);
+                               return (err);
+                       }
+                       vdisk_fini(&parent);
+
+                       store_footer = 1;
+                       stop = 1;
+               }
+
+               // Update timestamp
+               if (vhd_args->args_mask & VHD_ARG_TIME) {
+                       uint32_t curtime, ftime;
+
+                       curtime = time(NULL);
+                       if (curtime == (uint32_t)-1) {
+                               // Save errno: perror() may overwrite it
+                               err = errno;
+                               perror("time");
+                               return (err);
+                       }
+                       (void)vhd_time(&ftime, &curtime, 0);
+                       vhd_set_ftr_timestamp(vhd->ftr, ftime);
+
+                       // The timestamp lives in the footer, so the footer
+                       // has to be written back for the change to persist
+                       store_footer = 1;
+                       stop = 1;
+               }
+
+               // Recompute footer's checksum
+               vhd_set_ftr_chksum(vhd->ftr,
+                                  vhd_chksum(vhd->ftr, VHD_FTR_SZ,
+                                             &vhd->ftr[VHD_FTR_CHKSUM_OFF]));
+
+               // Write the footer back if needed
+               if (store_footer) {
+
+                       err = vdisk_size(vf->fd, &sz);
+                       if (err != 0) {
+                               VIDDBG(0, "Can't determine vdisk's size\n");
+                               return (-1);
+                       }
+
+                       // Guard the unsigned subtraction below
+                       if (sz < VHD_FTR_SZ) {
+                               VIDDBG(0, "%s is too small to hold a footer\n",
+                                      vf->name);
+                               return (-1);
+                       }
+
+                       if (lseek(vf->fd, (sz - VHD_FTR_SZ), SEEK_SET) ==
+                           (off_t)-1) {
+                               err = errno;
+                               perror("lseek");
+                               return (err);
+                       }
+                       bytes = write(vf->fd, vhd->ftr, VHD_FTR_SZ);
+                       if (bytes != (ssize_t)VHD_FTR_SZ) {
+                               // errno is only set when write() failed;
+                               // report a short write as EIO
+                               err = (bytes < 0) ? errno : EIO;
+                               VIDDBG(0, "write: %s\n", strerror(err));
+                               return (err);
+                       }
+
+                       // For non-fixed disks write footer at front as well
+                       if (vhd_get_ftr_type(vhd->ftr) != VHD_TYPE_FIXED) {
+                               if (lseek(vf->fd, 0, SEEK_SET) != 0) {
+                                       err = errno;
+                                       perror("lseek");
+                                       return (err);
+                               }
+                               bytes = write(vf->fd, vhd->ftr, VHD_FTR_SZ);
+                               if (bytes != (ssize_t)VHD_FTR_SZ) {
+                                       err = (bytes < 0) ? errno : EIO;
+                                       VIDDBG(0, "write: %s\n",
+                                              strerror(err));
+                                       return (err);
+                               }
+                       }
+               }
+
+               if (stop)
+                       break;
+       }
+
+       // vf stays NULL when the file list was empty
+       if ((vf != NULL) && fsync(vf->fd))
+               VIDDBG(0, "fsync: %s\n", strerror(errno));
+
+       return (0);
+}
+
+// Write exactly len bytes from buf to fd, reporting failures on stderr.
+// Returns 0 on success, the errno left by write() when it failed, or EIO
+// when write() stored fewer bytes than requested (errno is not set then).
+static int
+vhd_write_full(int fd, const void *buf, size_t len)
+{
+       ssize_t bytes;
+       int err;
+
+       bytes = write(fd, buf, len);
+       if (bytes == (ssize_t)len)
+               return (0);
+
+       err = (bytes < 0) ? errno : EIO;
+       fprintf(stderr, "write: %s\n", strerror(err));
+       return (err);
+}
+
+// Create a VHD image named filename as described by args (a vhd_args_t).
+//
+// The file is created exclusively. If it already exists it is accepted only
+// when a fixed VHD was requested: the existing file is reopened, truncated
+// to vhd_sz if it is larger, and a footer is written at offset vhd_sz.
+//
+// For dynamic and differencing disks the file receives, in order: a copy of
+// the footer, the dynamic header, a BAT with every entry set to
+// VHD_BAT_INVALID_ENTRY and zero-padded to a 512-byte boundary, one further
+// zeroed 512-byte block, for differencing disks the parent locator data
+// (via vhd_store_parent), and finally the footer. A differencing disk takes
+// its original size, current size and geometry from the parent's footer.
+//
+// filename - path of the image to create
+// args     - vhd_args_t with type, size, block size, uuid and parent
+//
+// Returns 0 on success or an errno-style value on failure. All resources
+// acquired here are released on every path.
+int
+vhd_create_vdisk(char *filename, void *args)
+{
+       vhd_args_t *vhd_args = args;
+       vhd_file_t vhd;
+       uint32_t curtime, ftime;
+       int vfd = -1;
+       int err = 0;
+       char *hdr_pool = NULL, *ftr_pool = NULL;
+       struct vdisk_dev parent;
+
+       // Set up the state inspected at "out" before anything can fail
+       parent.vdfd = NULL;
+       memset((char *)&vhd, 0, sizeof(vhd));
+
+       vfd = open(filename, O_CREAT|O_EXCL|O_RDWR, 0644);
+       if (vfd == -1) {
+               size_t sz;
+
+               if (errno != EEXIST) {
+                       err = errno;
+                       VIDDBG(0, "vfd open(%s, O_CREAT|O_EXCL|O_RDWR) "
+                              "failed: %s\n", filename, strerror(err));
+                       return (err);
+               }
+
+               // File already exists
+               if (vhd_args->type != VHD_TYPE_FIXED) {
+                       VIDDBG(0, "Raw files can only be converted to "
+                              "fixed VHD format\n");
+                       return (EINVAL);
+               }
+
+               vfd = open(filename, O_RDWR, 0644);
+               if (vfd == -1) {
+                       err = errno;
+                       VIDDBG(0, "vfd open(%s, O_RDWR) failed: %s\n",
+                              filename, strerror(err));
+                       return (err);
+               }
+
+               err = vdisk_size(vfd, &sz);
+               if (err) {
+                       VIDDBG(0, "vdisk_size(%s) failed: %s\n",
+                              filename, strerror(err));
+                       goto out;
+               }
+
+               if (vhd_args->vhd_sz < sz) {
+                       VIDDBG(0, "WARNING: Truncating %s (%ld bytes) "
+                              "to %ld bytes\n",
+                              filename, (long)sz, vhd_args->vhd_sz);
+
+                       if (ftruncate(vfd, vhd_args->vhd_sz) == -1) {
+                               err = errno;
+                               VIDDBG(0, "ftruncate(%s, %ld): %s\n",
+                                      filename, vhd_args->vhd_sz,
+                                      strerror(err));
+                               goto out;
+                       }
+               }
+       }
+
+       // Over-allocate by a sector so the footer can be 512-byte aligned;
+       // ftr_pool remembers the pointer that must be freed
+       ftr_pool = vhd.ftr = vdisk_malloc(VHD_FTR_SZ+512);
+       if (vhd.ftr == NULL) {
+               VIDDBG(0, "Couldn't allocate VHD footer\n");
+               err = ENOMEM;
+               goto out;
+       }
+       while ((addr_t)vhd.ftr & 511) vhd.ftr++;
+
+       vhd_set_ftr_cookie(vhd.ftr, VHD_COOKIE);
+       vhd_set_ftr_features(vhd.ftr, VHD_FEATURES_RSVD);
+       vhd_set_ftr_fformat(vhd.ftr, VHD_FORMAT_VER_1);
+       vhd_set_ftr_type(vhd.ftr, vhd_args->type);
+
+       curtime = time(NULL);
+       if (curtime == (uint32_t)-1) {
+               err = errno;
+               perror("time");
+               goto out;
+       }
+       (void)vhd_time(&ftime, &curtime, 0);
+       vhd_set_ftr_timestamp(vhd.ftr, ftime);
+
+       vhd_set_ftr_cr_app(vhd.ftr, VHD_CREATOR_APP);
+       vhd_set_ftr_cr_ver(vhd.ftr, VHD_CREATOR_VER_1);
+       vhd_set_ftr_cr_hostos(vhd.ftr, VHD_CREATOR_HOST_OS);
+       vhd_set_ftr_orig_sz(vhd.ftr, vhd_args->vhd_sz);
+       vhd_set_ftr_cur_sz(vhd.ftr, vhd_args->vhd_sz);
+       vhd_set_ftr_geom(vhd.ftr, vhd_chs(vhd_args->vhd_sz));
+
+       vhd_set_ftr_uid(vhd.ftr, vhd_args->uuid);
+
+       if (vhd_args->type == VHD_TYPE_FIXED)
+               vhd_set_ftr_dataoff(vhd.ftr, VHD_FIXED_OFFSET);
+       else if ((vhd_args->type == VHD_TYPE_DYNAMIC) ||
+                (vhd_args->type == VHD_TYPE_DIFF))
+               vhd_set_ftr_dataoff(vhd.ftr, VHD_FTR_SZ);
+       else
+               ASSERT(0);
+
+       vhd_set_ftr_chksum(vhd.ftr, vhd_chksum(vhd.ftr, VHD_FTR_SZ,
+                                              &vhd.ftr[VHD_FTR_CHKSUM_OFF]));
+
+       // Create dynamic header
+       if ((vhd_args->type == VHD_TYPE_DYNAMIC) ||
+           (vhd_args->type == VHD_TYPE_DIFF)) {
+
+               uint32_t bat_chunk[512 >> 2];
+               char zeros[512];
+               uint32_t entries, left, n, bat_sz;
+               loff_t data;
+               vhd_file_t *pvhd = NULL;
+               vd_file_t *pvf;
+
+               if (vhd_args->type == VHD_TYPE_DIFF) {
+                       // Read parent data
+                       err = vdisk_init(&parent, vhd_args->parent, NULL, 0);
+                       if (err) {
+                               VIDDBG(0, "Failed to initialize state for "
+                                      "parent %s\n", vhd_args->parent);
+                               // Initialization failed, so there is no
+                               // parent for "out" to tear down
+                               parent.vdfd = NULL;
+                               goto out;
+                       }
+                       pvf = list_entry(parent.vdf_head.next,
+                                        vd_file_t, vdf_list);
+                       pvhd = (vhd_file_t *)pvf->vdf;
+
+                       // Update footer fields inherited from parent
+                       vhd_set_ftr_orig_sz(vhd.ftr,
+                                           vhd_get_ftr_orig_sz(pvhd->ftr));
+                       vhd_set_ftr_cur_sz(vhd.ftr,
+                                          vhd_get_ftr_cur_sz(pvhd->ftr));
+                       vhd_set_ftr_geom(vhd.ftr,
+                                        vhd_get_ftr_geom(pvhd->ftr));
+
+                       vhd_args->vhd_sz = vhd_get_ftr_cur_sz(vhd.ftr);
+               }
+
+               // Same over-allocate-and-align scheme as for the footer
+               hdr_pool = vhd.dhdr = vdisk_malloc(VHD_DHDR_SZ+512);
+               if (vhd.dhdr == NULL) {
+                       VIDDBG(0, "Couldn't allocate dynamic header\n");
+                       err = ENOMEM;
+                       goto out;
+               }
+               while ((addr_t)vhd.dhdr & 511) vhd.dhdr++;
+
+               vhd_set_dhdr_cookie(vhd.dhdr, VHD_DYN_COOKIE);
+               vhd_set_dhdr_dataoff(vhd.dhdr, VHD_DYN_OFFSET);
+               vhd_set_dhdr_tbloff(vhd.dhdr, VHD_FTR_SZ+VHD_DHDR_SZ);
+               vhd_set_dhdr_hdrver(vhd.dhdr, VHD_DYN_HDR_VER_1);
+               vhd_set_dhdr_tbl_entries(vhd.dhdr,
+                                        vhd_args->vhd_sz/vhd_args->blocksz);
+               vhd_set_dhdr_blksz(vhd.dhdr, vhd_args->blocksz);
+
+               vhd_set_dhdr_chksum(vhd.dhdr,
+                                   vhd_chksum(vhd.dhdr, VHD_DHDR_SZ,
+                                              &vhd.dhdr[VHD_DHDR_CHKSUM_OFF]));
+
+               // Write the copy of the footer first
+               err = vhd_write_full(vfd, vhd.ftr, VHD_FTR_SZ);
+               if (err)
+                       goto out;
+
+               // Write the dynamic header
+               err = vhd_write_full(vfd, vhd.dhdr, VHD_DHDR_SZ);
+               if (err)
+                       goto out;
+
+               // Initialize BAT: every entry is marked invalid. Entries are
+               // written a sector's worth at a time instead of one by one.
+               for (n = 0; n < (512 >> 2); n++)
+                       bat_chunk[n] = VHD_BAT_INVALID_ENTRY;
+               entries = vhd_get_dhdr_tbl_entries(vhd.dhdr);
+               for (left = entries; left > 0; left -= n) {
+                       n = (left < (512 >> 2)) ? left : (512 >> 2);
+                       err = vhd_write_full(vfd, bat_chunk, n << 2);
+                       if (err)
+                               goto out;
+               }
+
+               // BAT must end on sector boundary (512 bytes)
+               memset(zeros, 0, sizeof(zeros));
+               bat_sz = entries << 2;
+               if (bat_sz & 511) {
+                       err = vhd_write_full(vfd, zeros, 512 - (bat_sz & 511));
+                       if (err)
+                               goto out;
+               }
+
+               // XXX: It appears that there is a 512B block
+               // at the end of BAT, which is not mentioned in the spec
+               err = vhd_write_full(vfd, zeros, sizeof(zeros));
+               if (err)
+                       goto out;
+
+               if (vhd_args->type == VHD_TYPE_DIFF) {
+                       // This will store dynamic header again, but that's OK
+                       err = vhd_store_parent(vfd, &vhd, pvhd,
+                                              vhd_args->parent, &data);
+                       if (err) {
+                               VIDDBG(0, "Failed to store parent name (%s)\n",
+                                      vhd_args->parent);
+                               goto out;
+                       }
+
+                       // Continue after the space reserved for the locator
+                       if (lseek(vfd, data, SEEK_SET) != data) {
+                               err = errno;
+                               perror("lseek");
+                               goto out;
+                       }
+               }
+       } else {
+               // for fixed disk, seek to the end of the file
+               if (lseek(vfd, vhd_args->vhd_sz, SEEK_SET) !=
+                   vhd_args->vhd_sz) {
+                       err = errno;
+                       perror("lseek");
+                       goto out;
+               }
+       }
+
+       // Write footer. For fixed disks allocate whole filesize
+       err = vhd_write_full(vfd, vhd.ftr, VHD_FTR_SZ);
+
+out:
+       if (parent.vdfd != NULL)
+               vdisk_fini(&parent);
+
+       // Free the pointers returned by vdisk_malloc(), not the aligned ones
+       if (ftr_pool)
+               vdisk_free(ftr_pool);
+       if (hdr_pool)
+               vdisk_free(hdr_pool);
+       if (vfd != -1) {
+               if (fsync(vfd))
+                       VIDDBG(0, "fsync: %s\n", strerror(errno));
+               close(vfd);
+       }
+
+       return (err);
+}

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-devel] [PATCH 4/4] (Refactored) Add libvdisk, and vdisk_tool, Ben Guthro <=