WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

Re: [Xen-devel] [RFC][PATCH] Use ioemu block drivers through blktap

To: Konrad Rzeszutek <konrad@xxxxxxxxxxxxxxx>
Subject: Re: [Xen-devel] [RFC][PATCH] Use ioemu block drivers through blktap
From: Kevin Wolf <kwolf@xxxxxxx>
Date: Fri, 14 Mar 2008 10:37:48 +0100
Cc: xen-devel@xxxxxxxxxxxxxxxxxxx
Delivery-date: Fri, 14 Mar 2008 02:36:10 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
In-reply-to: <20080313142108.GA3294@xxxxxxxxxxxxxxxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
References: <47D569F6.1010209@xxxxxxx> <47D91D3D.1000005@xxxxxxx> <20080313142108.GA3294@xxxxxxxxxxxxxxxxxxxx>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Thunderbird 1.5.0.8 (X11/20060911)
Hi Konrad,

first of all, thank you for your review. You noticed quite a few points
I never really looked at because I inherited them from the current
tapdisk code. But probably I should fix these issues as well. ;-)

Konrad Rzeszutek schrieb:
>> +    blkif->fds[READ] = open_ctrl_socket(wrctldev);
>> +    blkif->fds[WRITE] = open_ctrl_socket(rdctldev);
> 
> How about freeing the data here once?
> 
>> +    
>> +    if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1) {
>> +            free(rdctldev);
>> +            free(wrctldev);
> 
> And then this is not needed.
> 
>> +            return -1;
>> +    }
>> +
>> +    DPRINTF("Attached to qemu blktap pipes\n");
>> +    free(rdctldev);
>> +    free(wrctldev);
> 
> Nor these two lines above.

Hmm, good point. This code looks a bit silly... Will move the free to
the place you suggested.

>> --- a/tools/python/xen/xend/server/BlktapController.py       Mon Mar 10 
>> 22:51:57 2008 +0000
>> +++ b/tools/python/xen/xend/server/BlktapController.py       Thu Mar 13 
>> 13:00:18 2008 +0100
>> @@ -13,7 +13,9 @@ blktap_disk_types = [
>>      'vmdk',
>>      'ram',
>>      'qcow',
>> -    'qcow2'
>> +    'qcow2',
>> +
>> +    'ioemu'
> 
> Why add the extra \n ?

I wanted to separate the ioemu pseudo driver (which is the only one that
doesn't go through tapdisk) from the "real" tapdisk drivers.

>> +static struct td_state *state_init(void)
>> +{
>> +    int i;
>> +    struct td_state *s;
>> +    blkif_t *blkif;
>> +
>> +    s = malloc(sizeof(struct td_state));
> 
> Would it make sense to zero out the allocated memory?

This code comes directly from tapdisk and it worked there. On the other
hand, it certainly wouldn't hurt.

>> +                    switch (req->operation) 
>> +                    {
>> +                    case BLKIF_OP_WRITE:
>> +                            aiocb_info = malloc(sizeof(*aiocb_info));
>> +
>> +                            aiocb_info->s = s;
>> +                            aiocb_info->sector = sector_nr;
>> +                            aiocb_info->nr_secs = nsects;
>> +                            aiocb_info->idx = idx;
>> +                            aiocb_info->i = i;
>> +
>> +                            ret = (NULL == bdrv_aio_write(s->bs, sector_nr,
>> +                                                      page, nsects,
>> +                                                      qemu_send_responses,
>> +                                                      aiocb_info));
> 
> Who de-allocates aiocb_info?

qemu_send_responses is a callback function which gets aiocb_info as
parameter and frees it when it's done.

I've attached a new version of the patch.

Kevin
diff -r 7530c4dba8a5 tools/blktap/drivers/blktapctrl.c
--- a/tools/blktap/drivers/blktapctrl.c Mon Mar  3 15:19:39 2008
+++ b/tools/blktap/drivers/blktapctrl.c Fri Mar 14 11:14:10 2008
@@ -501,6 +501,80 @@
        return 0;
 }
 
+/* Connect to qemu-dm */
+static int connect_qemu(blkif_t *blkif)
+{
+       char *rdctldev, *wrctldev;
+       
+       if (asprintf(&rdctldev, BLKTAP_CTRL_DIR "/qemu-read-%d", 
+                       blkif->domid) < 0)
+               return -1;
+
+       if (asprintf(&wrctldev, BLKTAP_CTRL_DIR "/qemu-write-%d", 
+                       blkif->domid) < 0) {
+               free(rdctldev);
+               return -1;
+       }
+
+       DPRINTF("Using qemu blktap pipe: %s\n", rdctldev);
+       
+       blkif->fds[READ] = open_ctrl_socket(wrctldev);
+       blkif->fds[WRITE] = open_ctrl_socket(rdctldev);
+       
+       free(rdctldev);
+       free(wrctldev);
+       
+       if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1)
+               return -1;
+
+       DPRINTF("Attached to qemu blktap pipes\n");
+       return 0;
+}
+
+/* Launch tapdisk instance */
+static int connect_tapdisk(blkif_t *blkif, int minor)
+{
+       char *rdctldev = NULL, *wrctldev = NULL;
+       int ret = -1;
+
+       DPRINTF("tapdisk process does not exist:\n");
+
+       if (asprintf(&rdctldev,
+                    "%s/tapctrlread%d", BLKTAP_CTRL_DIR, minor) == -1)
+               goto fail;
+
+       if (asprintf(&wrctldev,
+                    "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, minor) == -1)
+               goto fail;
+       
+       blkif->fds[READ] = open_ctrl_socket(rdctldev);
+       blkif->fds[WRITE] = open_ctrl_socket(wrctldev);
+       
+       if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1)
+               goto fail;
+
+       /*launch the new process*/
+       DPRINTF("Launching process, CMDLINE [tapdisk %s %s]\n",
+                       wrctldev, rdctldev);
+
+       if (launch_tapdisk(wrctldev, rdctldev) == -1) {
+               DPRINTF("Unable to fork, cmdline: [tapdisk %s %s]\n",
+                               wrctldev, rdctldev);
+               goto fail;
+       }
+
+       ret = 0;
+       
+fail:
+       if (rdctldev)
+               free(rdctldev);
+
+       if (wrctldev)
+               free(wrctldev);
+
+       return ret;
+}
+
 int blktapctrl_new_blkif(blkif_t *blkif)
 {
        blkif_info_t *blk;
@@ -524,30 +598,14 @@
                blkif->cookie = next_cookie++;
 
                if (!exist) {
-                       DPRINTF("Process does not exist:\n");
-                       if (asprintf(&rdctldev,
-                                    "%s/tapctrlread%d", BLKTAP_CTRL_DIR, 
minor) == -1)
-                               goto fail;
-                       if (asprintf(&wrctldev,
-                                    "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, 
minor) == -1) {
-                               free(rdctldev);
-                               goto fail;
+                       if (type == DISK_TYPE_IOEMU) {
+                               if (connect_qemu(blkif))
+                                       goto fail;
+                       } else {
+                               if (connect_tapdisk(blkif, minor))
+                                       goto fail;
                        }
-                       blkif->fds[READ] = open_ctrl_socket(rdctldev);
-                       blkif->fds[WRITE] = open_ctrl_socket(wrctldev);
-                       
-                       if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1) 
-                               goto fail;
-
-                       /*launch the new process*/
-                       DPRINTF("Launching process, CMDLINE [tapdisk %s 
%s]\n",wrctldev, rdctldev);
-                       if (launch_tapdisk(wrctldev, rdctldev) == -1) {
-                               DPRINTF("Unable to fork, cmdline: [tapdisk %s 
%s]\n",wrctldev, rdctldev);
-                               goto fail;
-                       }
-
-                       free(rdctldev);
-                       free(wrctldev);
+
                } else {
                        DPRINTF("Process exists!\n");
                        blkif->fds[READ] = exist->fds[READ];
diff -r 7530c4dba8a5 tools/blktap/drivers/tapdisk.h
--- a/tools/blktap/drivers/tapdisk.h    Mon Mar  3 15:19:39 2008
+++ b/tools/blktap/drivers/tapdisk.h    Fri Mar 14 11:14:10 2008
@@ -167,6 +167,7 @@
 #define DISK_TYPE_RAM      3
 #define DISK_TYPE_QCOW     4
 #define DISK_TYPE_QCOW2    5
+#define DISK_TYPE_IOEMU    6
 
 
 /*Define Individual Disk Parameters here */
@@ -227,6 +228,16 @@
        0,
 #ifdef TAPDISK
        &tapdisk_qcow2,
+#endif
+};
+
+static disk_info_t ioemu_disk = {
+       DISK_TYPE_IOEMU,
+       "ioemu disk",
+       "ioemu",
+       0,
+#ifdef TAPDISK
+       NULL
 #endif
 };
 
@@ -238,6 +249,7 @@
        &ram_disk,
        &qcow_disk,
        &qcow2_disk,
+       &ioemu_disk,
 };
 
 typedef struct driver_list_entry {
diff -r 7530c4dba8a5 tools/blktap/lib/blktaplib.h
--- a/tools/blktap/lib/blktaplib.h      Mon Mar  3 15:19:39 2008
+++ b/tools/blktap/lib/blktaplib.h      Fri Mar 14 11:14:10 2008
@@ -221,15 +221,5 @@
      ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * getpagesize()) +    \
      ((_seg) * getpagesize()))
 
-/* Defines that are only used by library clients */
-
-#ifndef __COMPILING_BLKTAP_LIB
-
-static char *blkif_op_name[] = {
-       [BLKIF_OP_READ]       = "READ",
-       [BLKIF_OP_WRITE]      = "WRITE",
-};
-
-#endif /* __COMPILING_BLKTAP_LIB */
 
 #endif /* __BLKTAPLIB_H__ */
diff -r 7530c4dba8a5 tools/ioemu/Makefile.target
--- a/tools/ioemu/Makefile.target       Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/Makefile.target       Fri Mar 14 11:14:10 2008
@@ -17,6 +17,7 @@
 VPATH=$(SRC_PATH):$(TARGET_PATH):$(SRC_PATH)/hw:$(SRC_PATH)/audio
 CPPFLAGS+=-I. -I.. -I$(TARGET_PATH) -I$(SRC_PATH)
 CPPFLAGS+= -I$(XEN_ROOT)/tools/libxc
+CPPFLAGS+= -I$(XEN_ROOT)/tools/blktap/lib
 CPPFLAGS+= -I$(XEN_ROOT)/tools/xenstore
 CPPFLAGS+= -I$(XEN_ROOT)/tools/include
 ifdef CONFIG_DARWIN_USER
@@ -429,6 +430,7 @@
 VL_OBJS+= usb-uhci.o smbus_eeprom.o
 VL_OBJS+= piix4acpi.o
 VL_OBJS+= xenstore.o
+VL_OBJS+= xen_blktap.o
 VL_OBJS+= xen_platform.o
 VL_OBJS+= xen_machine_fv.o
 VL_OBJS+= xen_machine_pv.o
diff -r 7530c4dba8a5 tools/ioemu/hw/xen_machine_pv.c
--- a/tools/ioemu/hw/xen_machine_pv.c   Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/hw/xen_machine_pv.c   Fri Mar 14 11:14:10 2008
@@ -26,6 +26,9 @@
 #include "xen_console.h"
 #include "xenfb.h"
 
+extern void init_blktap(void);
+
+
 /* The Xen PV machine currently provides
  *   - a virtual framebuffer
  *   - ....
@@ -40,6 +43,10 @@
 {
     struct xenfb *xenfb;
     extern int domid;
+
+
+    /* Initialize tapdisk client */
+    init_blktap();
 
     /* Connect to text console */
     if (serial_hds[0]) {
diff -r 7530c4dba8a5 tools/ioemu/vl.c
--- a/tools/ioemu/vl.c  Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/vl.c  Fri Mar 14 11:14:10 2008
@@ -6266,6 +6266,12 @@
     powerdown_requested = 1;
     if (cpu_single_env)
         cpu_interrupt(cpu_single_env, CPU_INTERRUPT_EXIT);
+}
+
+static void qemu_sighup_handler(int signal)
+{
+    fprintf(stderr, "Received SIGHUP, terminating.\n");
+    exit(0);
 }
 
 void main_loop_wait(int timeout)
@@ -7976,7 +7982,7 @@
 
 #ifndef CONFIG_STUBDOM
     /* Unblock SIGTERM and SIGHUP, which may have been blocked by the caller */
-    signal(SIGHUP, SIG_DFL);
+    signal(SIGHUP, qemu_sighup_handler);
     sigemptyset(&set);
     sigaddset(&set, SIGTERM);
     sigaddset(&set, SIGHUP);
diff -r 7530c4dba8a5 tools/python/xen/xend/server/BlktapController.py
--- a/tools/python/xen/xend/server/BlktapController.py  Mon Mar  3 15:19:39 2008
+++ b/tools/python/xen/xend/server/BlktapController.py  Fri Mar 14 11:14:10 2008
@@ -13,7 +13,9 @@
     'vmdk',
     'ram',
     'qcow',
-    'qcow2'
+    'qcow2',
+
+    'ioemu'
     ]
 
 class BlktapController(BlkifController):
diff -r 7530c4dba8a5 tools/ioemu/hw/xen_blktap.c
--- /dev/null   Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/hw/xen_blktap.c       Fri Mar 14 11:14:10 2008
@@ -0,0 +1,686 @@
+/* xen_blktap.c
+ *
+ * Interface to blktapctrl to allow use of qemu block drivers with blktap.
+ * This file is based on tools/blktap/drivers/tapdisk.c
+ * 
+ * Copyright (c) 2005 Julian Chesterfield and Andrew Warfield.
+ * Copyright (c) 2008 Kevin Wolf
+ */
+
+/*
+ * There are several communication channels which are used by this interface:
+ *
+ *   - A pair of pipes for receiving and sending general control messages
+ *     (qemu-read-N and qemu-writeN in /var/run/tap, where N is the domain ID).
+ *     These control messages are handled by handle_blktap_ctrlmsg().
+ *
+ *   - One file descriptor per attached disk (/dev/xen/blktapN) for disk
+ *     specific control messages. A callback is triggered on this fd if there
+ *     is a new IO request. The callback function is handle_blktap_iomsg().
+ *
+ *   - A shared ring for each attached disk containing the actual IO requests 
+ *     and responses. Whenever handle_blktap_iomsg() is triggered it processes
+ *     the requests on this ring.
+ */
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+
+#include "vl.h"
+#include "blktaplib.h"
+#include "xen_blktap.h"
+#include "block_int.h"
+
+#define MSG_SIZE 4096
+
+#define BLKTAP_CTRL_DIR "/var/run/tap"
+
+/* If enabled, print debug messages to stderr */
+#if 1
+#define DPRINTF(_f, _a...) fprintf(stderr, __FILE__ ":%d: " _f, __LINE__, ##_a)
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+#if 1                                                                        
+#define ASSERT(_p) \
+    if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s\n", #_p , \
+        __LINE__, __FILE__); *(int*)0=0; }
+#else
+#define ASSERT(_p) ((void)0)
+#endif 
+
+
+extern int domid;
+
+int read_fd;
+int write_fd;
+
+static pid_t process;
+fd_list_entry_t *fd_start = NULL;
+
+static void handle_blktap_iomsg(void* private);
+
+struct aiocb_info {
+       struct td_state *s;
+       uint64_t sector;
+       int nr_secs;
+       int idx;
+       long i;
+};
+
+static void unmap_disk(struct td_state *s)
+{
+       tapdev_info_t *info = s->ring_info;
+       fd_list_entry_t *entry;
+       
+       bdrv_close(s->bs);
+
+       if (info != NULL && info->mem > 0)
+               munmap(info->mem, getpagesize() * BLKTAP_MMAP_REGION_SIZE);
+
+       entry = s->fd_entry;
+       *entry->pprev = entry->next;
+       if (entry->next)
+               entry->next->pprev = entry->pprev;
+
+       qemu_set_fd_handler2(info->fd, NULL, NULL, NULL, NULL);
+       close(info->fd);
+
+       free(s->fd_entry);
+       free(s->blkif);
+       free(s->ring_info);
+       free(s);
+
+       return;
+}
+
+static inline fd_list_entry_t *add_fd_entry(int tap_fd, struct td_state *s)
+{
+       fd_list_entry_t **pprev, *entry;
+
+       DPRINTF("Adding fd_list_entry\n");
+
+       /*Add to linked list*/
+       s->fd_entry   = entry = malloc(sizeof(fd_list_entry_t));
+       entry->tap_fd = tap_fd;
+       entry->s      = s;
+       entry->next   = NULL;
+
+       pprev = &fd_start;
+       while (*pprev != NULL)
+               pprev = &(*pprev)->next;
+
+       *pprev = entry;
+       entry->pprev = pprev;
+
+       return entry;
+}
+
+static inline struct td_state *get_state(int cookie)
+{
+       fd_list_entry_t *ptr;
+
+       ptr = fd_start;
+       while (ptr != NULL) {
+               if (ptr->cookie == cookie) return ptr->s;
+               ptr = ptr->next;
+       }
+       return NULL;
+}
+
+static struct td_state *state_init(void)
+{
+       int i;
+       struct td_state *s;
+       blkif_t *blkif;
+
+       s = malloc(sizeof(struct td_state));
+       blkif = s->blkif = malloc(sizeof(blkif_t));
+       s->ring_info = calloc(1, sizeof(tapdev_info_t));
+
+       for (i = 0; i < MAX_REQUESTS; i++) {
+               blkif->pending_list[i].secs_pending = 0;
+               blkif->pending_list[i].submitting = 0;
+       }
+
+       return s;
+}
+
+static int map_new_dev(struct td_state *s, int minor)
+{
+       int tap_fd;
+       tapdev_info_t *info = s->ring_info;
+       char *devname;
+       fd_list_entry_t *ptr;
+       int page_size;
+
+       if (asprintf(&devname,"%s/%s%d", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, 
minor) == -1)
+               return -1;
+       tap_fd = open(devname, O_RDWR);
+       if (tap_fd == -1) 
+       {
+               DPRINTF("open failed on dev %s!\n",devname);
+               goto fail;
+       } 
+       info->fd = tap_fd;
+
+       /*Map the shared memory*/
+       page_size = getpagesize();
+       info->mem = mmap(0, page_size * BLKTAP_MMAP_REGION_SIZE, 
+                         PROT_READ | PROT_WRITE, MAP_SHARED, info->fd, 0);
+       if ((long int)info->mem == -1) 
+       {
+               DPRINTF("mmap failed on dev %s!\n",devname);
+               goto fail;
+       }
+
+       /* assign the rings to the mapped memory */ 
+       info->sring = (blkif_sring_t *)((unsigned long)info->mem);
+       BACK_RING_INIT(&info->fe_ring, info->sring, page_size);
+       
+       info->vstart = 
+               (unsigned long)info->mem + (BLKTAP_RING_PAGES * page_size);
+
+       ioctl(info->fd, BLKTAP_IOCTL_SENDPID, process );
+       ioctl(info->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE );
+       free(devname);
+
+       /*Update the fd entry*/
+       ptr = fd_start;
+       while (ptr != NULL) {
+               if (s == ptr->s) {
+                       ptr->tap_fd = tap_fd;
+
+                       /* Setup fd_handler for qemu main loop */
+                       DPRINTF("set tap_fd = %d\n", tap_fd);
+                       qemu_set_fd_handler2(tap_fd, NULL, 
&handle_blktap_iomsg, NULL, s);
+
+                       break;
+               }
+               ptr = ptr->next;
+       }       
+
+
+       DPRINTF("map_new_dev = %d\n", minor);
+       return minor;
+
+ fail:
+       free(devname);
+       return -1;
+}
+
+static int open_disk(struct td_state *s, char *path, int readonly)
+{
+       struct disk_id id;
+       BlockDriverState* bs;
+
+       DPRINTF("Opening %s\n", path);
+       bs = calloc(1, sizeof(*bs));
+
+       memset(&id, 0, sizeof(struct disk_id));
+
+       if (bdrv_open(bs, path, 0) != 0) {
+               fprintf(stderr, "Could not open image file %s\n", path);
+               return -ENOMEM;
+       }
+
+       s->bs = bs;
+       s->flags = readonly ? TD_RDONLY : 0;
+       s->size = bs->total_sectors;
+       s->sector_size = 512;
+
+       s->info = ((s->flags & TD_RDONLY) ? VDISK_READONLY : 0);
+
+       return 0;
+}
+
+static inline void write_rsp_to_ring(struct td_state *s, blkif_response_t *rsp)
+{
+       tapdev_info_t *info = s->ring_info;
+       blkif_response_t *rsp_d;
+       
+       rsp_d = RING_GET_RESPONSE(&info->fe_ring, info->fe_ring.rsp_prod_pvt);
+       memcpy(rsp_d, rsp, sizeof(blkif_response_t));
+       info->fe_ring.rsp_prod_pvt++;
+}
+
+static inline void kick_responses(struct td_state *s)
+{
+       tapdev_info_t *info = s->ring_info;
+
+       if (info->fe_ring.rsp_prod_pvt != info->fe_ring.sring->rsp_prod) 
+       {
+               RING_PUSH_RESPONSES(&info->fe_ring);
+               ioctl(info->fd, BLKTAP_IOCTL_KICK_FE);
+       }
+}
+
+static int send_responses(struct td_state *s, int res, 
+                  uint64_t sector, int nr_secs, int idx, void *private)
+{
+       pending_req_t   *preq;
+       blkif_request_t *req;
+       int responses_queued = 0;
+       blkif_t *blkif = s->blkif;
+       int secs_done = nr_secs;
+
+       if ( (idx > MAX_REQUESTS-1) )
+       {
+               DPRINTF("invalid index returned(%u)!\n", idx);
+               return 0;
+       }
+       preq = &blkif->pending_list[idx];
+       req  = &preq->req;
+
+       preq->secs_pending -= secs_done;
+
+       if (res == -EBUSY && preq->submitting) 
+               return -EBUSY;  /* propagate -EBUSY back to higher layers */
+       if (res) 
+               preq->status = BLKIF_RSP_ERROR;
+       
+       if (!preq->submitting && preq->secs_pending == 0) 
+       {
+               blkif_request_t tmp;
+               blkif_response_t *rsp;
+
+               tmp = preq->req;
+               rsp = (blkif_response_t *)req;
+               
+               rsp->id = tmp.id;
+               rsp->operation = tmp.operation;
+               rsp->status = preq->status;
+               
+               write_rsp_to_ring(s, rsp);
+               responses_queued++;
+
+               kick_responses(s);
+       }
+       
+       return responses_queued;
+}
+
+static void qemu_send_responses(void* opaque, int ret)
+{
+       struct aiocb_info* info = opaque;
+
+       if (ret != 0) {
+               DPRINTF("ERROR: ret = %d (%s)\n", ret, strerror(-ret));
+       }
+
+       send_responses(info->s, ret, info->sector, info->nr_secs, 
+               info->idx, (void*) info->i);
+       free(info);
+}
+
+/**
+ * Callback function for the IO message pipe. Reads requests from the ring
+ * and processes them (call qemu read/write functions).
+ *
+ * The private parameter points to the struct td_state representing the
+ * disk the request is targeted at.
+ */
+static void handle_blktap_iomsg(void* private)
+{
+       struct td_state* s = private;
+
+       RING_IDX          rp, j, i;
+       blkif_request_t  *req;
+       int idx, nsects, ret;
+       uint64_t sector_nr;
+       uint8_t *page;
+       blkif_t *blkif = s->blkif;
+       tapdev_info_t *info = s->ring_info;
+       int page_size = getpagesize();
+
+       struct aiocb_info *aiocb_info;
+
+       if (info->fe_ring.sring == NULL) {
+               DPRINTF("  sring == NULL, ignoring IO request\n");
+               return;
+       }
+
+       rp = info->fe_ring.sring->req_prod; 
+       xen_rmb();
+
+       for (j = info->fe_ring.req_cons; j != rp; j++)
+       {
+               int start_seg = 0; 
+
+               req = NULL;
+               req = RING_GET_REQUEST(&info->fe_ring, j);
+               ++info->fe_ring.req_cons;
+               
+               if (req == NULL)
+                       continue;
+
+               idx = req->id;
+
+               ASSERT(blkif->pending_list[idx].secs_pending == 0);
+               memcpy(&blkif->pending_list[idx].req, req, sizeof(*req));
+               blkif->pending_list[idx].status = BLKIF_RSP_OKAY;
+               blkif->pending_list[idx].submitting = 1;
+               sector_nr = req->sector_number;
+
+               /* Don't allow writes on readonly devices */
+               if ((s->flags & TD_RDONLY) && 
+                   (req->operation == BLKIF_OP_WRITE)) {
+                       blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+                       goto send_response;
+               }
+
+               for (i = start_seg; i < req->nr_segments; i++) {
+                       nsects = req->seg[i].last_sect - 
+                                req->seg[i].first_sect + 1;
+       
+                       if ((req->seg[i].last_sect >= page_size >> 9) ||
+                                       (nsects <= 0))
+                               continue;
+
+                       page  = (uint8_t*) MMAP_VADDR(info->vstart, 
+                                                  (unsigned long)req->id, i);
+                       page += (req->seg[i].first_sect << SECTOR_SHIFT);
+
+                       if (sector_nr >= s->size) {
+                               DPRINTF("Sector request failed:\n");
+                               DPRINTF("%s request, idx [%d,%d] size [%llu], "
+                                       "sector [%llu,%llu]\n",
+                                       (req->operation == BLKIF_OP_WRITE ? 
+                                        "WRITE" : "READ"),
+                                       idx,i,
+                                       (long long unsigned) 
+                                               nsects<<SECTOR_SHIFT,
+                                       (long long unsigned) 
+                                               sector_nr<<SECTOR_SHIFT,
+                                       (long long unsigned) sector_nr);
+                               continue;
+                       }
+
+                       blkif->pending_list[idx].secs_pending += nsects;
+
+                       switch (req->operation) 
+                       {
+                       case BLKIF_OP_WRITE:
+                               aiocb_info = malloc(sizeof(*aiocb_info));
+
+                               aiocb_info->s = s;
+                               aiocb_info->sector = sector_nr;
+                               aiocb_info->nr_secs = nsects;
+                               aiocb_info->idx = idx;
+                               aiocb_info->i = i;
+
+                               ret = (NULL == bdrv_aio_write(s->bs, sector_nr,
+                                                         page, nsects,
+                                                         qemu_send_responses,
+                                                         aiocb_info));
+
+                               if (ret) {
+                                       blkif->pending_list[idx].status = 
BLKIF_RSP_ERROR;
+                                       DPRINTF("ERROR: bdrv_write() == 
NULL\n");
+                                       goto send_response;
+                               }
+                               break;
+
+                       case BLKIF_OP_READ:
+                               aiocb_info = malloc(sizeof(*aiocb_info));
+
+                               aiocb_info->s = s;
+                               aiocb_info->sector = sector_nr;
+                               aiocb_info->nr_secs = nsects;
+                               aiocb_info->idx = idx;
+                               aiocb_info->i = i;
+
+                               ret = (NULL == bdrv_aio_read(s->bs, sector_nr,
+                                                        page, nsects,
+                                                        qemu_send_responses,
+                                                        aiocb_info));
+
+                               if (ret) {
+                                       blkif->pending_list[idx].status = 
BLKIF_RSP_ERROR;
+                                       DPRINTF("ERROR: bdrv_read() == NULL\n");
+                                       goto send_response;
+                               }
+                               break;
+
+                       default:
+                               DPRINTF("Unknown block operation\n");
+                               break;
+                       }
+                       sector_nr += nsects;
+               }
+       send_response:
+               blkif->pending_list[idx].submitting = 0;
+
+               /* force write_rsp_to_ring for synchronous case */
+               if (blkif->pending_list[idx].secs_pending == 0)
+                       send_responses(s, 0, 0, 0, idx, (void *)(long)0);
+       }
+}
+
+/**
+ * Callback function for the qemu-read pipe. Reads and processes control 
+ * message from the pipe.
+ *
+ * The parameter private is unused.
+ */
+static void handle_blktap_ctrlmsg(void* private)
+{
+       int length, len, msglen;
+       char *ptr, *path;
+       image_t *img;
+       msg_hdr_t *msg;
+       msg_newdev_t *msg_dev;
+       msg_pid_t *msg_pid;
+       int ret = -1;
+       struct td_state *s = NULL;
+       fd_list_entry_t *entry;
+
+       char buf[MSG_SIZE];
+
+       length = read(read_fd, buf, MSG_SIZE);
+
+       if (length > 0 && length >= sizeof(msg_hdr_t)) 
+       {
+               msg = (msg_hdr_t *)buf;
+               DPRINTF("blktap: Received msg, len %d, type %d, UID %d\n",
+                       length,msg->type,msg->cookie);
+
+               switch (msg->type) {
+               case CTLMSG_PARAMS:                     
+                       ptr = buf + sizeof(msg_hdr_t);
+                       len = (length - sizeof(msg_hdr_t));
+                       path = calloc(1, len + 1);
+                       
+                       memcpy(path, ptr, len); 
+                       DPRINTF("Received CTLMSG_PARAMS: [%s]\n", path);
+
+                       /* Allocate the disk structs */
+                       s = state_init();
+
+                       /*Open file*/
+                       if (s == NULL || open_disk(s, path, msg->readonly)) {
+                               msglen = sizeof(msg_hdr_t);
+                               msg->type = CTLMSG_IMG_FAIL;
+                               msg->len = msglen;
+                       } else {
+                               entry = add_fd_entry(0, s);
+                               entry->cookie = msg->cookie;
+                               DPRINTF("Entered cookie %d\n", entry->cookie);
+                               
+                               memset(buf, 0x00, MSG_SIZE); 
+                       
+                               msglen = sizeof(msg_hdr_t) + sizeof(image_t);
+                               msg->type = CTLMSG_IMG;
+                               img = (image_t *)(buf + sizeof(msg_hdr_t));
+                               img->size = s->size;
+                               img->secsize = s->sector_size;
+                               img->info = s->info;
+                               DPRINTF("Writing (size, secsize, info) = "
+                                       "(%#" PRIx64 ", %#" PRIx64 ", %d)\n",
+                                       s->size, s->sector_size, s->info);
+                       }
+                       len = write(write_fd, buf, msglen);
+                       free(path);
+                       break;
+                       
+               case CTLMSG_NEWDEV:
+                       msg_dev = (msg_newdev_t *)(buf + sizeof(msg_hdr_t));
+
+                       s = get_state(msg->cookie);
+                       DPRINTF("Retrieving state, cookie %d.....[%s]\n",
+                               msg->cookie, (s == NULL ? "FAIL":"OK"));
+                       if (s != NULL) {
+                               ret = ((map_new_dev(s, msg_dev->devnum) 
+                                       == msg_dev->devnum ? 0: -1));
+                       }       
+
+                       memset(buf, 0x00, MSG_SIZE); 
+                       msglen = sizeof(msg_hdr_t);
+                       msg->type = (ret == 0 ? CTLMSG_NEWDEV_RSP 
+                                             : CTLMSG_NEWDEV_FAIL);
+                       msg->len = msglen;
+
+                       len = write(write_fd, buf, msglen);
+                       break;
+
+               case CTLMSG_CLOSE:
+                       s = get_state(msg->cookie);
+                       if (s) unmap_disk(s);
+                       break;                  
+
+               case CTLMSG_PID:
+                       memset(buf, 0x00, MSG_SIZE);
+                       msglen = sizeof(msg_hdr_t) + sizeof(msg_pid_t);
+                       msg->type = CTLMSG_PID_RSP;
+                       msg->len = msglen;
+
+                       msg_pid = (msg_pid_t *)(buf + sizeof(msg_hdr_t));
+                       process = getpid();
+                       msg_pid->pid = process;
+
+                       len = write(write_fd, buf, msglen);
+                       break;
+
+               default:
+                       break;
+               }
+       }
+}
+
+/**
+ * Opens a control socket, i.e. a pipe to communicate with blktapctrl.
+ *
+ * Returns the file descriptor number for the pipe; -1 in error case
+ */
+static int open_ctrl_socket(char *devname)
+{
+       int ret;
+       int ipc_fd;
+
+       if (mkdir(BLKTAP_CTRL_DIR, 0755) == 0)
+               DPRINTF("Created %s directory\n", BLKTAP_CTRL_DIR);
+
+       ret = mkfifo(devname,S_IRWXU|S_IRWXG|S_IRWXO);
+       if ( (ret != 0) && (errno != EEXIST) ) {
+               DPRINTF("ERROR: pipe failed (%d)\n", errno);
+               return -1;
+       }
+
+       ipc_fd = open(devname,O_RDWR|O_NONBLOCK);
+
+       if (ipc_fd < 0) {
+               DPRINTF("FD open failed\n");
+               return -1;
+       }
+
+       return ipc_fd;
+}
+
+/**
+ * Unmaps all disks and closes their pipes
+ */
+void shutdown_blktap(void)
+{
+       fd_list_entry_t *ptr;
+       struct td_state *s;
+       char *devname;
+
+       DPRINTF("Shutdown blktap\n");
+
+       /* Unmap all disks */
+       ptr = fd_start;
+       while (ptr != NULL) {
+               s = ptr->s;
+               unmap_disk(s);
+               close(ptr->tap_fd);
+               ptr = ptr->next;
+       }
+
+       /* Delete control pipes */
+       if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) >= 0) {
+               DPRINTF("Delete %s\n", devname);
+               if (unlink(devname))
+                       DPRINTF("Could not delete: %s\n", strerror(errno));
+               free(devname);
+       }
+       
+       if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) >= 0) { 
+               DPRINTF("Delete %s\n", devname);
+               if (unlink(devname))
+                       DPRINTF("Could not delete: %s\n", strerror(errno));
+               free(devname);
+       }
+}
+
+/**
+ * Initialize the blktap interface, i.e. open a pair of pipes in /var/run/tap
+ * and register a fd handler.
+ *
+ * Returns 0 on success.
+ */
+int init_blktap(void)
+{
+       char* devname;  
+
+       DPRINTF("Init blktap pipes\n");
+
+       /* Open the read pipe */
+       if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) >= 0) {  
+               read_fd = open_ctrl_socket(devname);            
+               free(devname);
+               
+               if (read_fd == -1) {
+                       fprintf(stderr, "Could not open %s/qemu-read-%d\n",
+                               BLKTAP_CTRL_DIR, domid);
+                       return -1;
+               }
+       }
+       
+       /* Open the write pipe */
+       if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) >= 0) {
+               write_fd = open_ctrl_socket(devname);
+               free(devname);
+               
+               if (write_fd == -1) {
+                       fprintf(stderr, "Could not open %s/qemu-write-%d\n",
+                               BLKTAP_CTRL_DIR, domid);
+                       close(read_fd);
+                       return -1;
+               }
+       }
+
+       /* Attach a handler to the read pipe (called from qemu main loop) */
+       qemu_set_fd_handler2(read_fd, NULL, &handle_blktap_ctrlmsg, NULL, NULL);
+
+       /* Register handler to clean up when the domain is destroyed */
+       atexit(&shutdown_blktap);
+
+       return 0;
+}
diff -r 7530c4dba8a5 tools/ioemu/hw/xen_blktap.h
--- /dev/null   Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/hw/xen_blktap.h       Fri Mar 14 11:14:10 2008
@@ -0,0 +1,57 @@
+/* xen_blktap.h
+ *
+ * Generic disk interface for blktap-based image adapters.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ */
+
+#ifndef XEN_BLKTAP_H_ 
+#define XEN_BLKTAP_H_
+
+#include <stdint.h>
+#include <syslog.h>
+#include <stdio.h>
+
+#include "block_int.h"
+
+/* Things disks need to know about, these should probably be in a higher-level
+ * header. */
+#define MAX_SEGMENTS_PER_REQ    11
+#define SECTOR_SHIFT             9
+#define DEFAULT_SECTOR_SIZE    512
+
+#define MAX_IOFD                 2
+
+#define BLK_NOT_ALLOCATED       99
+#define TD_NO_PARENT             1
+
+typedef uint32_t td_flag_t;
+
+#define TD_RDONLY                1
+
+struct disk_id {
+       char *name;
+       int drivertype;
+};
+
+/* This structure represents the state of an active virtual disk.           */
+struct td_state {
+       BlockDriverState* bs;
+       td_flag_t flags;
+       void *blkif;
+       void *image;
+       void *ring_info;
+       void *fd_entry;
+       uint64_t sector_size;
+       uint64_t size;
+       unsigned int       info;
+};
+
+typedef struct fd_list_entry {
+       int cookie;
+       int  tap_fd;
+       struct td_state *s;
+       struct fd_list_entry **pprev, *next;
+} fd_list_entry_t;
+
+#endif /*XEN_BLKTAP_H_*/
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel