# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxx
# Date 1182267148 -3600
# Node ID eeeb77195ac20e8e08c57a7248913bedc0d999bf
# Parent 865c4ae59be3b0aa7e375b929414244d1e5bdf74
blktap: Add fallback code to blktap for missing poll-on-aio support.
blktap requires a xen specific kernel AIO ABI which has been vetoed by
upstream in favour of another approach. Rather than include this ABI,
Fedora has been carrying a patch which makes tap:aio use a thread to
poll for aio events and notify the main thread via a pipe.
The upstream approach of allowing io_getevents() poll normal file
descriptors via epoll is still progressing:
http://lkml.org/lkml/2007/1/3/16
but when that does make it upstream, blktap will require significant
re-working to use that approach.
In the meantime, here's a patch which uses the poll-in-a-thread
approach only if AIO poll support isn't available. It also hides the
details behind a simple abstraction and makes both tap:aio and
tap:qcow use it.
Signed-off-by: Mark McLoughlin <markmc@xxxxxxxxxx>
---
tools/blktap/drivers/Makefile | 1
tools/blktap/drivers/block-aio.c | 49 ++++-------
tools/blktap/drivers/block-qcow.c | 48 +++++------
tools/blktap/drivers/tapaio.c | 164 ++++++++++++++++++++++++++++++++++++++
tools/blktap/drivers/tapaio.h | 58 +++++++++++++
5 files changed, 267 insertions(+), 53 deletions(-)
diff -r 865c4ae59be3 -r eeeb77195ac2 tools/blktap/drivers/Makefile
--- a/tools/blktap/drivers/Makefile Tue Jun 19 16:29:22 2007 +0100
+++ b/tools/blktap/drivers/Makefile Tue Jun 19 16:32:28 2007 +0100
@@ -35,6 +35,7 @@ BLK-OBJS += block-ram.o
BLK-OBJS += block-ram.o
BLK-OBJS += block-qcow.o
BLK-OBJS += aes.o
+BLK-OBJS += tapaio.o
all: $(IBIN) qcow-util
diff -r 865c4ae59be3 -r eeeb77195ac2 tools/blktap/drivers/block-aio.c
--- a/tools/blktap/drivers/block-aio.c Tue Jun 19 16:29:22 2007 +0100
+++ b/tools/blktap/drivers/block-aio.c Tue Jun 19 16:32:28 2007 +0100
@@ -43,14 +43,7 @@
#include <sys/ioctl.h>
#include <linux/fs.h>
#include "tapdisk.h"
-
-
-/**
- * We used a kernel patch to return an fd associated with the AIO context
- * so that we can concurrently poll on synchronous and async descriptors.
- * This is signalled by passing 1 as the io context to io_setup.
- */
-#define REQUEST_ASYNC_FD 1
+#include "tapaio.h"
#define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)
@@ -65,14 +58,13 @@ struct tdaio_state {
int fd;
/* libaio state */
- io_context_t aio_ctx;
+ tap_aio_context_t aio_ctx;
struct iocb iocb_list [MAX_AIO_REQS];
struct iocb *iocb_free [MAX_AIO_REQS];
struct pending_aio pending_aio[MAX_AIO_REQS];
int iocb_free_count;
struct iocb *iocb_queue[MAX_AIO_REQS];
int iocb_queued;
- int poll_fd; /* NB: we require aio_poll support */
struct io_event aio_events[MAX_AIO_REQS];
};
@@ -148,7 +140,7 @@ static inline void init_fds(struct disk_
for(i = 0; i < MAX_IOFD; i++)
dd->io_fd[i] = 0;
- dd->io_fd[0] = prv->poll_fd;
+ dd->io_fd[0] = prv->aio_ctx.pollfd;
}
/* Open the disk file and initialize aio state. */
@@ -162,12 +154,9 @@ int tdaio_open (struct disk_driver *dd,
/* Initialize AIO */
prv->iocb_free_count = MAX_AIO_REQS;
prv->iocb_queued = 0;
-
- prv->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
- prv->poll_fd = io_setup(MAX_AIO_REQS, &prv->aio_ctx);
-
- if (prv->poll_fd < 0) {
- ret = prv->poll_fd;
+
+ ret = tap_aio_setup(&prv->aio_ctx, prv->aio_events, MAX_AIO_REQS);
+ if (ret < 0) {
if (ret == -EAGAIN) {
DPRINTF("Couldn't setup AIO context. If you are "
"trying to concurrently use a large number "
@@ -176,9 +165,7 @@ int tdaio_open (struct disk_driver *dd,
"(e.g. 'echo echo 1048576 > /proc/sys/fs/"
"aio-max-nr')\n");
} else {
- DPRINTF("Couldn't get fd for AIO poll support. This "
- "is probably because your kernel does not "
- "have the aio-poll patch applied.\n");
+ DPRINTF("Couldn't setup AIO context.\n");
}
goto done;
}
@@ -286,7 +273,7 @@ int tdaio_submit(struct disk_driver *dd)
if (!prv->iocb_queued)
return 0;
- ret = io_submit(prv->aio_ctx, prv->iocb_queued, prv->iocb_queue);
+ ret = io_submit(prv->aio_ctx.aio_ctx, prv->iocb_queued,
prv->iocb_queue);
/* XXX: TODO: Handle error conditions here. */
@@ -300,7 +287,7 @@ int tdaio_close(struct disk_driver *dd)
{
struct tdaio_state *prv = (struct tdaio_state *)dd->private;
- io_destroy(prv->aio_ctx);
+ io_destroy(prv->aio_ctx.aio_ctx);
close(prv->fd);
return 0;
@@ -308,15 +295,13 @@ int tdaio_close(struct disk_driver *dd)
int tdaio_do_callbacks(struct disk_driver *dd, int sid)
{
- int ret, i, rsp = 0;
+ int i, nr_events, rsp = 0;
struct io_event *ep;
struct tdaio_state *prv = (struct tdaio_state *)dd->private;
- /* Non-blocking test for completed io. */
- ret = io_getevents(prv->aio_ctx, 0, MAX_AIO_REQS, prv->aio_events,
- NULL);
-
- for (ep=prv->aio_events,i=ret; i-->0; ep++) {
+ nr_events = tap_aio_get_events(&prv->aio_ctx);
+repeat:
+ for (ep = prv->aio_events, i = nr_events; i-- > 0; ep++) {
struct iocb *io = ep->obj;
struct pending_aio *pio;
@@ -327,6 +312,14 @@ int tdaio_do_callbacks(struct disk_drive
prv->iocb_free[prv->iocb_free_count++] = io;
}
+
+ if (nr_events) {
+ nr_events = tap_aio_more_events(&prv->aio_ctx);
+ goto repeat;
+ }
+
+ tap_aio_continue(&prv->aio_ctx);
+
return rsp;
}
diff -r 865c4ae59be3 -r eeeb77195ac2 tools/blktap/drivers/block-qcow.c
--- a/tools/blktap/drivers/block-qcow.c Tue Jun 19 16:29:22 2007 +0100
+++ b/tools/blktap/drivers/block-qcow.c Tue Jun 19 16:32:28 2007 +0100
@@ -38,6 +38,7 @@
#include "bswap.h"
#include "aes.h"
#include "tapdisk.h"
+#include "tapaio.h"
#if 1
#define ASSERT(_p) \
@@ -52,9 +53,6 @@
(uint64_t)( \
(l + (s - 1)) - ((l + (s - 1)) % s)); \
})
-
-/******AIO DEFINES******/
-#define REQUEST_ASYNC_FD 1
struct pending_aio {
td_callback_t cb;
@@ -145,7 +143,7 @@ struct tdqcow_state {
AES_KEY aes_encrypt_key; /*AES key*/
AES_KEY aes_decrypt_key; /*AES key*/
/* libaio state */
- io_context_t aio_ctx;
+ tap_aio_context_t aio_ctx;
int max_aio_reqs;
struct iocb *iocb_list;
struct iocb **iocb_free;
@@ -153,7 +151,6 @@ struct tdqcow_state {
int iocb_free_count;
struct iocb **iocb_queue;
int iocb_queued;
- int poll_fd; /* NB: we require aio_poll support */
struct io_event *aio_events;
};
@@ -179,7 +176,7 @@ static void free_aio_state(struct disk_d
static int init_aio_state(struct disk_driver *dd)
{
- int i;
+ int i, ret;
struct td_state *bs = dd->td_state;
struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
long ioidx;
@@ -216,12 +213,9 @@ static int init_aio_state(struct disk_dr
goto fail;
}
- /*Signal kernel to create Poll FD for Asyc completion events*/
- s->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
- s->poll_fd = io_setup(s->max_aio_reqs, &s->aio_ctx);
-
- if (s->poll_fd < 0) {
- if (s->poll_fd == -EAGAIN) {
+ ret = tap_aio_setup(&s->aio_ctx, s->aio_events, s->max_aio_reqs);
+ if (ret < 0) {
+ if (ret == -EAGAIN) {
DPRINTF("Couldn't setup AIO context. If you are "
"trying to concurrently use a large number "
"of blktap-based disks, you may need to "
@@ -229,9 +223,7 @@ static int init_aio_state(struct disk_dr
"(e.g. 'echo echo 1048576 > /proc/sys/fs/"
"aio-max-nr')\n");
} else {
- DPRINTF("Couldn't get fd for AIO poll support. This "
- "is probably because your kernel does not "
- "have the aio-poll patch applied.\n");
+ DPRINTF("Couldn't setup AIO context.\n");
}
goto fail;
}
@@ -845,7 +837,7 @@ static inline void init_fds(struct disk_
for(i = 0; i < MAX_IOFD; i++)
dd->io_fd[i] = 0;
- dd->io_fd[0] = s->poll_fd;
+ dd->io_fd[0] = s->aio_ctx.pollfd;
}
/* Open the disk file and initialize qcow state. */
@@ -1144,7 +1136,7 @@ int tdqcow_submit(struct disk_driver *dd
if (!prv->iocb_queued)
return 0;
- ret = io_submit(prv->aio_ctx, prv->iocb_queued, prv->iocb_queue);
+ ret = io_submit(prv->aio_ctx.aio_ctx, prv->iocb_queued,
prv->iocb_queue);
/* XXX: TODO: Handle error conditions here. */
@@ -1172,7 +1164,7 @@ int tdqcow_close(struct disk_driver *dd)
close(fd);
}
- io_destroy(s->aio_ctx);
+ io_destroy(s->aio_ctx.aio_ctx);
free(s->name);
free(s->l1_table);
free(s->l2_cache);
@@ -1184,17 +1176,15 @@ int tdqcow_close(struct disk_driver *dd)
int tdqcow_do_callbacks(struct disk_driver *dd, int sid)
{
- int ret, i, rsp = 0,*ptr;
+ int ret, i, nr_events, rsp = 0,*ptr;
struct io_event *ep;
struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;
if (sid > MAX_IOFD) return 1;
-
- /* Non-blocking test for completed io. */
- ret = io_getevents(prv->aio_ctx, 0, prv->max_aio_reqs, prv->aio_events,
- NULL);
-
- for (ep = prv->aio_events, i = ret; i-- > 0; ep++) {
+
+ nr_events = tap_aio_get_events(&prv->aio_ctx);
+repeat:
+ for (ep = prv->aio_events, i = nr_events; i-- > 0; ep++) {
struct iocb *io = ep->obj;
struct pending_aio *pio;
@@ -1215,6 +1205,14 @@ int tdqcow_do_callbacks(struct disk_driv
prv->iocb_free[prv->iocb_free_count++] = io;
}
+
+ if (nr_events) {
+ nr_events = tap_aio_more_events(&prv->aio_ctx);
+ goto repeat;
+ }
+
+ tap_aio_continue(&prv->aio_ctx);
+
return rsp;
}
diff -r 865c4ae59be3 -r eeeb77195ac2 tools/blktap/drivers/tapaio.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap/drivers/tapaio.c Tue Jun 19 16:32:28 2007 +0100
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2006 Andrew Warfield and Julian Chesterfield
+ * Copyright (c) 2007 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "tapaio.h"
+#include "tapdisk.h"
+#include <unistd.h>
+
+/**
+ * We used a kernel patch to return an fd associated with the AIO context
+ * so that we can concurrently poll on synchronous and async descriptors.
+ * This is signalled by passing 1 as the io context to io_setup.
+ */
+#define REQUEST_ASYNC_FD 1
+
+/*
+ * If we don't have any way to do epoll on aio events in a normal kernel,
+ * wait for aio events in a separate thread and return completion status
+ * that via a pipe that can be waited on normally.
+ *
+ * To keep locking problems between the completion thread and the submit
+ * thread to a minimum, there's a handshake which allows only one thread
+ * to be doing work on the completion queue at a time:
+ *
+ * 1) main thread sends completion thread a command via the command pipe;
+ * 2) completion thread waits for aio events and returns the number
+ * received on the completion pipe
+ * 3) main thread processes the received ctx->aio_events events
+ * 4) loop back to 1) to let the completion thread refill the aio_events
+ * buffer.
+ *
+ * This workaround needs to disappear once the kernel provides a single
+ * mechanism for waiting on both aio and normal fd wakeups.
+ */
+static void *
+tap_aio_completion_thread(void *arg)
+{
+ tap_aio_context_t *ctx = (tap_aio_context_t *) arg;
+ int command;
+ int nr_events;
+ int rc;
+
+ while (1) {
+ rc = read(ctx->command_fd[0], &command, sizeof(command));
+
+ do {
+ rc = io_getevents(ctx->aio_ctx, 1,
+ ctx->max_aio_events, ctx->aio_events,
+ NULL);
+ if (rc) {
+ nr_events = rc;
+ rc = write(ctx->completion_fd[1], &nr_events,
+ sizeof(nr_events));
+ }
+ } while (!rc);
+ }
+}
+
+void
+tap_aio_continue(tap_aio_context_t *ctx)
+{
+ int cmd = 0;
+
+ if (!ctx->poll_in_thread)
+ return;
+
+ if (write(ctx->command_fd[1], &cmd, sizeof(cmd)) < 0)
+ DPRINTF("Cannot write to command pipe\n");
+}
+
+int
+tap_aio_setup(tap_aio_context_t *ctx,
+ struct io_event *aio_events,
+ int max_aio_events)
+{
+ int ret;
+
+ ctx->aio_events = aio_events;
+ ctx->max_aio_events = max_aio_events;
+ ctx->poll_in_thread = 0;
+
+ ctx->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
+ ret = io_setup(ctx->max_aio_events, &ctx->aio_ctx);
+ if (ret < 0 && ret != -EINVAL)
+ return ret;
+ else if (ret > 0) {
+ ctx->pollfd = ret;
+ return ctx->pollfd;
+ }
+
+ ctx->aio_ctx = (io_context_t) 0;
+ ret = io_setup(ctx->max_aio_events, &ctx->aio_ctx);
+ if (ret < 0)
+ return ret;
+
+ if ((ret = pipe(ctx->command_fd)) < 0) {
+ DPRINTF("Unable to create command pipe\n");
+ return -1;
+ }
+ if ((ret = pipe(ctx->completion_fd)) < 0) {
+ DPRINTF("Unable to create completion pipe\n");
+ return -1;
+ }
+
+ if ((ret = pthread_create(&ctx->aio_thread, NULL,
+ tap_aio_completion_thread, ctx)) != 0) {
+ DPRINTF("Unable to create completion thread\n");
+ return -1;
+ }
+
+ ctx->pollfd = ctx->completion_fd[0];
+ ctx->poll_in_thread = 1;
+
+ tap_aio_continue(ctx);
+
+ return 0;
+}
+
+int
+tap_aio_get_events(tap_aio_context_t *ctx)
+{
+ int nr_events = 0;
+
+ if (!ctx->poll_in_thread)
+ nr_events = io_getevents(ctx->aio_ctx, 1,
+ ctx->max_aio_events, ctx->aio_events,
NULL);
+ else
+ read(ctx->completion_fd[0], &nr_events, sizeof(nr_events));
+
+ return nr_events;
+}
+
+int tap_aio_more_events(tap_aio_context_t *ctx)
+{
+ return io_getevents(ctx->aio_ctx, 0,
+ ctx->max_aio_events, ctx->aio_events, NULL);
+}
+
+
diff -r 865c4ae59be3 -r eeeb77195ac2 tools/blktap/drivers/tapaio.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap/drivers/tapaio.h Tue Jun 19 16:32:28 2007 +0100
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2006 Andrew Warfield and Julian Chesterfield
+ * Copyright (c) 2007 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __TAPAIO_H__
+#define __TAPAIO_H__
+
+#include <pthread.h>
+#include <libaio.h>
+
+struct tap_aio_context {
+ io_context_t aio_ctx;
+
+ struct io_event *aio_events;
+ int max_aio_events;
+
+ pthread_t aio_thread;
+ int command_fd[2];
+ int completion_fd[2];
+ int pollfd;
+ unsigned int poll_in_thread : 1;
+};
+
+typedef struct tap_aio_context tap_aio_context_t;
+
+int tap_aio_setup (tap_aio_context_t *ctx,
+ struct io_event *aio_events,
+ int max_aio_events);
+void tap_aio_continue (tap_aio_context_t *ctx);
+int tap_aio_get_events (tap_aio_context_t *ctx);
+int tap_aio_more_events(tap_aio_context_t *ctx);
+
+#endif /* __TAPAIO_H__ */
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|