This is a bonus patch for those wondering how a virtio implementation
can look. I have two; this is the more efficient one (it needs some
modification for inter-guest use, though: it assumes the other end does
all the accessing of our memory). It's currently tacked on to the existing
lguest I/O mechanism as a demonstration, rather than replacing it.
It shows that it's possible to implement virtio without internal
locking.
Userspace server-side code isn't included.
===
This allows zero-copy between guest and host. It uses a page of
descriptors, a page to say what descriptors to use, and a page to say
what's been used: one such set for inbufs and one for outbufs.
TODO:
1) More polishing
2) Get rid of old I/O
3) Inter-guest I/O implementation
Signed-off-by: Rusty Russell <rusty@xxxxxxxxxxxxxxx>
---
drivers/lguest/Makefile | 2
drivers/lguest/hypercalls.c | 4
drivers/lguest/lguest_virtio.c | 511 +++++++++++++++++++++++++++++++++++++++
include/linux/lguest.h | 3
include/linux/lguest_launcher.h | 24 +
6 files changed, 948 insertions(+), 5 deletions(-)
--- a/drivers/lguest/Makefile
+++ b/drivers/lguest/Makefile
@@ -1,5 +1,5 @@
# Guest requires the paravirt_ops replacement and the bus driver.
-obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o
+obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o lguest_virtio.o
# Host requires the other files, which can be a module.
obj-$(CONFIG_LGUEST) += lg.o
===================================================================
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -86,6 +86,10 @@ static void do_hcall(struct lguest *lg,
break;
case LHCALL_HALT:
lg->halted = 1;
+ break;
+ case LHCALL_NOTIFY:
+ lg->pending_key = regs->edx << PAGE_SHIFT;
+ lg->dma_is_pending = 1;
break;
default:
kill_guest(lg, "Bad hypercall %li\n", regs->eax);
===================================================================
--- /dev/null
+++ b/drivers/lguest/lguest_virtio.c
@@ -0,0 +1,511 @@
+/* Descriptor-based virtio backend using lguest. */
+
+/* FIXME: Put "running" in shared page so other side really doesn't
+ * send us interrupts. Then we would never need to "fail" restart.
+ * If there are more buffers when we set "running", simply ping other
+ * side. It would interrupt us back again.
+ */
+#define DEBUG
+#include <linux/lguest.h>
+#include <linux/lguest_bus.h>
+#include <linux/virtio.h>
+#include <linux/interrupt.h>
+#include <asm/io.h>
+
+/* Number of descriptors which fit in the one page holding them. */
+#define NUM_DESCS (PAGE_SIZE / sizeof(struct lguest_desc))
+
+#ifdef DEBUG
+/* For development, we want to crash whenever the other side is bad. */
+#define BAD_SIDE(lgv, fmt...) \
+	do { dev_err((lgv)->vdev.dev, fmt); BUG(); } while(0)
+/* Callers are meant to serialize access for us: in_use remembers the line
+ * which entered, and entering again while it is still set panics. */
+#define START_USE(di) \
+	do { if ((di)->in_use) panic("in_use = %i\n", (di)->in_use); \
+	     (di)->in_use = __LINE__; mb(); } while(0)
+#define END_USE(di) \
+	do { BUG_ON(!(di)->in_use); (di)->in_use = 0; mb(); } while(0)
+#else
+/* Without DEBUG, complain and mark the device broken instead of crashing. */
+#define BAD_SIDE(lgv, fmt...) \
+	do { dev_err((lgv)->vdev.dev, fmt); (lgv)->broken = true; } while(0)
+/* No checking: expand to an empty statement that is safe after "if (x)". */
+#define START_USE(di) do { } while(0)
+#define END_USE(di) do { } while(0)
+#endif
+
+/* FIXME: make the device mem layout a struct, not a set of pointers */
+/* Our view of one direction (in or out) of a device: pointers into the
+ * pages mapped by lg_new_virtio() plus our private bookkeeping. */
+struct desc_info
+{
+ /* Page of descriptors (NUM_DESCS entries). */
+ struct lguest_desc *desc;
+ /* How we tell other side what buffers are available: avail_idx is a
+  * free-running count only advanced by lguest_sync(); available[] holds
+  * head descriptor numbers and is indexed modulo NUM_DESCS. */
+ unsigned int *avail_idx;
+ unsigned int *available;
+ /* How other side tells us what's used: used[] is read at
+  * last_used_idx modulo NUM_DESCS until we catch up with *used_idx. */
+ unsigned int *used_idx;
+ struct lguest_used *used;
+
+ /* Number of free buffers (descriptors on the free list). */
+ unsigned int num_free;
+ /* Head of free buffer list, chained through desc[].next. */
+ unsigned int free_head;
+ /* Number we've added since last sync (in available[], not yet
+  * counted in *avail_idx). */
+ unsigned int num_added;
+
+ /* Last used index we've seen. */
+ unsigned int last_used_idx;
+
+ /* Unless they told us to stop: takes the return value of the driver
+  * callback in the interrupt handler; set again by restart. */
+ bool running;
+
+#ifdef DEBUG
+ /* They're supposed to lock for us.  Non-zero (the line number of the
+  * START_USE) while somebody is inside. */
+ unsigned int in_use;
+#endif
+
+ /* Tokens for callbacks, indexed by head descriptor number. */
+ void *data[NUM_DESCS];
+};
+
+/* FIXME: When doing this for real, vdev will go straight into lguest_device */
+/* One virtio device sitting on top of an lguest bus device. */
+struct lguest_virtio_device
+{
+ /* Embedded so vdev_to_lgv() can get back to us via container_of(). */
+ struct virtio_device vdev;
+ /* The lguest bus device we were created for. */
+ struct lguest_device *lg;
+ /* Whatever the driver's probe returned (the net_device or gendisk in
+  * the example drivers in this file). */
+ void *priv;
+
+ /* Other side has made a mess, don't try any more. */
+ bool broken;
+
+ /* Descriptor state for each direction. */
+ struct desc_info in, out;
+};
+
+/* Step back from the embedded virtio_device to the structure holding it. */
+static inline struct lguest_virtio_device *
+vdev_to_lgv(struct virtio_device *vdev)
+{
+	struct lguest_virtio_device *lgv;
+
+	lgv = container_of(vdev, struct lguest_virtio_device, vdev);
+	return lgv;
+}
+
+/* Take @num descriptors off the free list, fill them in from @sg as one
+ * chain, and queue the chain's head in available[].
+ *
+ * @data is the caller's token for this buffer (must not be NULL); it is
+ * stored against the head and handed back by get_buf().
+ *
+ * Returns the head descriptor number, or -ENOSPC (as an unsigned long) if
+ * fewer than @num descriptors are free.  *avail_idx is not touched here:
+ * that only happens in lguest_sync(). */
+static unsigned long add_buf(struct desc_info *di,
+ const struct scatterlist *sg,
+ unsigned int num,
+ void *data)
+{
+ unsigned int i, head, uninitialized_var(prev);
+
+ BUG_ON(data == NULL);
+ START_USE(di);
+
+ if (di->num_free < num) {
+ pr_debug("Can't add buf len %i - avail = %i\n", num,
+ di->num_free);
+ END_USE(di);
+ return -ENOSPC;
+ }
+
+ /* We're about to use some buffers from the free list. */
+ di->num_free -= num;
+
+ head = di->free_head;
+ /* Walk the free list via .next, one descriptor per sg element. */
+ for (i = di->free_head; num; i = di->desc[i].next, num--) {
+ di->desc[i].flags |= LGUEST_DESC_F_NEXT;
+ di->desc[i].pfn = page_to_pfn(sg[0].page);
+ di->desc[i].offset = sg[0].offset;
+ di->desc[i].len = sg[0].length;
+ prev = i;
+ sg++;
+ }
+ /* Last one doesn't continue. */
+ di->desc[prev].flags &= ~LGUEST_DESC_F_NEXT;
+
+ /* Update free pointer: i is now the first descriptor we didn't take. */
+ di->free_head = i;
+
+ di->data[head] = data;
+
+ /* Make sure it's all visible to other side before setting head. */
+ wmb();
+ di->desc[head].flags |= LGUEST_DESC_F_HEAD;
+
+ /* Put it in available array for advertising. */
+ di->available[(*di->avail_idx + di->num_added++) % NUM_DESCS] = head;
+
+ pr_debug("Added buffer head %i\n", head);
+ END_USE(di);
+ return head;
+}
+
+/* add_outbuf operation: queue @num scatterlist entries on the out side.
+ * @num must be between 1 and NUM_DESCS; the result is add_buf()'s. */
+static unsigned long lguest_add_outbuf(struct virtio_device *vdev,
+				       const struct scatterlist sg[],
+				       unsigned int num,
+				       void *data)
+{
+	struct desc_info *out = &vdev_to_lgv(vdev)->out;
+
+	BUG_ON(num > NUM_DESCS);
+	BUG_ON(num == 0);
+
+	return add_buf(out, sg, num, data);
+}
+
+/* add_inbuf operation: queue @num scatterlist entries on the in side.
+ * @num must be between 1 and NUM_DESCS; the result is add_buf()'s. */
+static unsigned long lguest_add_inbuf(struct virtio_device *vdev,
+				      struct scatterlist sg[],
+				      unsigned int num,
+				      void *data)
+{
+	struct desc_info *in = &vdev_to_lgv(vdev)->in;
+
+	BUG_ON(num > NUM_DESCS);
+	BUG_ON(num == 0);
+
+	return add_buf(in, sg, num, data);
+}
+
+/* sync operation: publish everything queued by add_buf() since the last
+ * sync on the direction(s) selected by @inout, then notify the other side
+ * with the LHCALL_NOTIFY hypercall. */
+static void lguest_sync(struct virtio_device *vdev, enum virtio_dir inout)
+{
+ struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+ if (inout & VIRTIO_IN)
+ START_USE(&lgv->in);
+ if (inout & VIRTIO_OUT)
+ START_USE(&lgv->out);
+ /* LGUEST_DESC_F_HEAD needs to be set before we say they're avail. */
+ wmb();
+
+ /* Fold the pending count into the shared index for each direction. */
+ if (inout & VIRTIO_IN) {
+ *lgv->in.avail_idx += lgv->in.num_added;
+ lgv->in.num_added = 0;
+ }
+ if (inout & VIRTIO_OUT) {
+ *lgv->out.avail_idx += lgv->out.num_added;
+ lgv->out.num_added = 0;
+ }
+
+ /* Prod other side to tell it about changes. */
+ hcall(LHCALL_NOTIFY, lguest_devices[lgv->lg->index].pfn, 0, 0);
+ if (inout & VIRTIO_IN)
+ END_USE(&lgv->in);
+ if (inout & VIRTIO_OUT)
+ END_USE(&lgv->out);
+}
+
+/* Clear the HEAD flag on descriptor @id and splice its whole chain onto
+ * the front of the free list, bumping num_free once per descriptor.
+ * @id must be in range and currently marked as a head. */
+static void detach_buf(struct desc_info *di, int id)
+{
+ unsigned int i;
+
+ BUG_ON(id >= NUM_DESCS);
+ BUG_ON(!(di->desc[id].flags & LGUEST_DESC_F_HEAD));
+
+ di->desc[id].flags &= ~LGUEST_DESC_F_HEAD;
+ /* Make sure other side has seen that it's detached. */
+ wmb();
+
+ /* Put back on free list: find end */
+ for (i = id; di->desc[i].flags&LGUEST_DESC_F_NEXT; i=di->desc[i].next)
+ di->num_free++;
+
+ /* i is the last descriptor of the chain: hook the old free list on. */
+ di->desc[i].next = di->free_head;
+ di->free_head = id;
+ /* Plus final descriptor */
+ di->num_free++;
+}
+
+/* detach_outbuf operation: release out-side buffer @id back to the free
+ * list. */
+static void lguest_detach_outbuf(struct virtio_device *vdev, unsigned long id)
+{
+	struct desc_info *out = &vdev_to_lgv(vdev)->out;
+
+	START_USE(out);
+	detach_buf(out, id);
+	END_USE(out);
+}
+
+/* detach_inbuf operation: release in-side buffer @id back to the free
+ * list. */
+static void lguest_detach_inbuf(struct virtio_device *vdev, unsigned long id)
+{
+	struct desc_info *in = &vdev_to_lgv(vdev)->in;
+
+	START_USE(in);
+	detach_buf(in, id);
+	END_USE(in);
+}
+
+/* True when the other side's used index has moved past what we've seen. */
+static bool more_used(struct desc_info *di)
+{
+	unsigned int seen = di->last_used_idx;
+
+	return seen != *di->used_idx;
+}
+
+/* Fetch the next buffer the other side has finished with.
+ *
+ * Returns the token which was given to add_buf() for that buffer, with the
+ * length reported by the other side stored in *len, and puts the buffer's
+ * descriptors back on the free list.  Returns NULL if nothing new has been
+ * used, or if the shared pages look wrong (reported via BAD_SIDE()). */
+static void *get_buf(struct desc_info *di, struct lguest_virtio_device *lgv,
+		     unsigned int *len)
+{
+	unsigned int id;
+
+	START_USE(di);
+
+	if (!more_used(di)) {
+		END_USE(di);
+		return NULL;
+	}
+
+	/* Don't let them make us do infinite work.  Both indices are
+	 * free-running counters, so measure the distance between them:
+	 * "last_used_idx + NUM_DESCS" would wrap and misfire once
+	 * last_used_idx gets close to UINT_MAX. */
+	if (unlikely(*di->used_idx - di->last_used_idx > NUM_DESCS)) {
+		BAD_SIDE(lgv, "Too many descriptors\n");
+		END_USE(di);
+		return NULL;
+	}
+
+	id = di->used[di->last_used_idx%NUM_DESCS].id;
+	*len = di->used[di->last_used_idx%NUM_DESCS].len;
+
+	/* The id comes from the other side: never index with it unchecked. */
+	if (unlikely(id >= NUM_DESCS)) {
+		BAD_SIDE(lgv, "id %u out of range\n", id);
+		END_USE(di);
+		return NULL;
+	}
+	if (unlikely(!(di->desc[id].flags & LGUEST_DESC_F_HEAD))) {
+		BAD_SIDE(lgv, "id %u is not a head!\n", id);
+		END_USE(di);
+		return NULL;
+	}
+
+	detach_buf(di, id);
+	di->last_used_idx++;
+	BUG_ON(!di->data[id]);
+	END_USE(di);
+	return di->data[id];
+}
+
+/* get_outbuf operation: next out-side buffer the other side has used, or
+ * NULL. */
+static void *lguest_get_outbuf(struct virtio_device *vdev, unsigned int *len)
+{
+	struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+	struct desc_info *out = &lgv->out;
+
+	return get_buf(out, lgv, len);
+}
+
+/* get_inbuf operation: next in-side buffer the other side has used, or
+ * NULL. */
+static void *lguest_get_inbuf(struct virtio_device *vdev, unsigned int *len)
+{
+	struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+	struct desc_info *in = &lgv->in;
+
+	return get_buf(in, lgv, len);
+}
+
+/* restart_in operation: try to turn in-side callbacks back on.  Must only
+ * be called while they are off.  Returns true if they are now on, false if
+ * more used buffers are already waiting (callbacks stay off). */
+static bool lguest_restart_in(struct virtio_device *vdev)
+{
+	struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+	struct desc_info *in = &lgv->in;
+
+	START_USE(in);
+	BUG_ON(in->running);
+
+	/* A broken device always "restarts": nothing more will be done. */
+	if (likely(!more_used(in)) || unlikely(lgv->broken))
+		in->running = true;
+
+	END_USE(in);
+	return in->running;
+}
+
+/* restart_out operation: try to turn out-side callbacks back on.  Must only
+ * be called while they are off.  Returns true if they are now on, false if
+ * more used buffers are already waiting (callbacks stay off).
+ *
+ * Everything here must look at the *out* side: checking or setting the in
+ * side would leave out.running false forever and flip the in-side state
+ * behind its owner's back. */
+static bool lguest_restart_out(struct virtio_device *vdev)
+{
+	struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+	START_USE(&lgv->out);
+	BUG_ON(lgv->out.running);
+
+	/* A broken device always "restarts": nothing more will be done. */
+	if (likely(!more_used(&lgv->out)) || unlikely(lgv->broken))
+		lgv->out.running = true;
+
+	END_USE(&lgv->out);
+	return lgv->out.running;
+}
+
+/* Interrupt handler: for each direction which is running and has used
+ * buffers waiting, call the driver's callback; its return value becomes
+ * the new "running" state.  Does nothing once the device is broken. */
+static irqreturn_t lguest_virtio_interrupt(int irq, void *_lgv)
+{
+	struct lguest_virtio_device *lgv = _lgv;
+	struct virtio_device *vdev = &lgv->vdev;
+
+	if (likely(!lgv->broken)) {
+		/* Out side first, then in side. */
+		if (lgv->out.running && more_used(&lgv->out))
+			lgv->out.running = vdev->driver_ops->out(vdev);
+
+		if (lgv->in.running && more_used(&lgv->in))
+			lgv->in.running = vdev->driver_ops->in(vdev);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/* The virtio operations, as implemented over lguest descriptor pages.
+ * Installed as vdev.ops by lg_new_virtio(). */
+static struct virtio_ops lguest_virtio_ops = {
+ .add_outbuf = lguest_add_outbuf,
+ .add_inbuf = lguest_add_inbuf,
+ .sync = lguest_sync,
+ .detach_outbuf = lguest_detach_outbuf,
+ .detach_inbuf = lguest_detach_inbuf,
+ .get_outbuf = lguest_get_outbuf,
+ .get_inbuf = lguest_get_inbuf,
+ .restart_in = lguest_restart_in,
+ .restart_out = lguest_restart_out,
+};
+
+/* Allocate and set up the virtio state for @lgdev: map its six device
+ * pages, point both desc_infos into them and chain every descriptor onto
+ * the free lists.
+ *
+ * Returns the new device (also stored in lgdev->private), or NULL if the
+ * allocation or the mapping failed.  The interrupt is not requested here:
+ * see lg_setup_interrupt(). */
+static struct lguest_virtio_device *lg_new_virtio(struct lguest_device *lgdev)
+{
+	struct lguest_virtio_device *lgv;
+	void *mem;
+	unsigned int i;
+
+	/* Zeroed, so last_used_idx, num_added, free_head and broken all
+	 * start out as 0/false without further help. */
+	lgv = kzalloc(sizeof(*lgv), GFP_KERNEL);
+	if (!lgv)
+		return NULL;
+
+	lgv->lg = lgdev;
+
+	/* Device mem is input pages followed by output pages */
+	mem = lguest_map(lguest_devices[lgdev->index].pfn<<PAGE_SHIFT, 6);
+	if (!mem)
+		goto free_lgv;
+
+	/* Three pages per direction: descriptors, available, used.  The
+	 * index lives in the first word of its page, the array follows. */
+	lgv->in.desc = mem;
+	lgv->in.avail_idx = mem + PAGE_SIZE;
+	lgv->in.available = (void *)(lgv->in.avail_idx + 1);
+	lgv->in.used_idx = mem + PAGE_SIZE*2;
+	lgv->in.used = (void *)(lgv->in.used_idx + 1);
+	lgv->out.desc = mem + PAGE_SIZE*3;
+	lgv->out.avail_idx = mem + PAGE_SIZE*4;
+	lgv->out.available = (void *)(lgv->out.avail_idx + 1);
+	lgv->out.used_idx = mem + PAGE_SIZE*5;
+	lgv->out.used = (void *)(lgv->out.used_idx + 1);
+
+	lgv->in.running = lgv->out.running = true;
+
+	/* Put everything in free lists. */
+	lgv->in.num_free = lgv->out.num_free = NUM_DESCS;
+	for (i = 0; i < NUM_DESCS-1; i++) {
+		lgv->in.desc[i].next = i+1;
+		lgv->out.desc[i].next = i+1;
+	}
+
+	lgv->vdev.ops = &lguest_virtio_ops;
+	lgv->vdev.dev = &lgdev->dev;
+
+	/* Only hand ourselves to the bus device once nothing can fail any
+	 * more, so a failed setup never leaves ->private pointing at
+	 * freed memory. */
+	lgdev->private = lgv;
+	return lgv;
+
+free_lgv:
+	kfree(lgv);
+	return NULL;
+}
+
+/* Undo lg_new_virtio(): unmap the device pages and free the state.  The
+ * caller must already have torn down anything layered on top. */
+static void lg_destroy_virtio(struct lguest_virtio_device *lgv)
+{
+	/* lg_new_virtio() stored us here; don't leave a dangling pointer. */
+	lgv->lg->private = NULL;
+
+	/* in.desc is the start of the whole mapping. */
+	lguest_unmap(lgv->in.desc);
+	kfree(lgv);
+}
+
+/* Request the device's interrupt under @name.  This is not part of
+ * lg_new_virtio() because the callers take the name from the object their
+ * probe creates afterwards.  Returns request_irq()'s result. */
+static int lg_setup_interrupt(struct lguest_virtio_device *lgv,
+			      const char *name)
+{
+	int irqf = 0;
+
+	/* Feed the entropy pool only if the device says it is suitable. */
+	if (lguest_devices[lgv->lg->index].features&LGUEST_DEVICE_F_RANDOMNESS)
+		irqf = IRQF_SAMPLE_RANDOM;
+
+	return request_irq(lgdev_irq(lgv->lg), lguest_virtio_interrupt, irqf,
+			   name, lgv);
+}
+
+/* Example network driver code. */
+#include <linux/virtio_net.h>
+#include <linux/etherdevice.h>
+
+/* Bus probe for the example network device: create the virtio state, let
+ * virtnet_probe() build a net device on it with a random MAC address, then
+ * hook up the interrupt under the net device's name.  Returns 0 or a
+ * negative errno, undoing whatever was set up on failure. */
+static int lguest_virtnet_probe(struct lguest_device *lgdev)
+{
+	struct lguest_virtio_device *lgv = lg_new_virtio(lgdev);
+	struct net_device *dev;
+	u8 mac[ETH_ALEN];
+	int err;
+
+	if (!lgv)
+		return -ENOMEM;
+
+	random_ether_addr(mac);
+	dev = virtnet_probe(&lgv->vdev, mac);
+	lgv->priv = dev;
+	if (IS_ERR(dev)) {
+		err = PTR_ERR(dev);
+		goto destroy;
+	}
+
+	err = lg_setup_interrupt(lgv, dev->name);
+	if (!err)
+		return 0;
+
+	/* No interrupt: take the net device down again. */
+	virtnet_remove(dev);
+destroy:
+	lg_destroy_virtio(lgv);
+	return err;
+}
+
+/* lguest bus driver for devices of type LGUEST_DEVICE_T_VIRTNET. */
+static struct lguest_driver lguest_virtnet_drv = {
+ .name = "lguestvirtnet",
+ .owner = THIS_MODULE,
+ .device_type = LGUEST_DEVICE_T_VIRTNET,
+ .probe = lguest_virtnet_probe,
+};
+
+/* Register the example network driver with the lguest bus. */
+static __init int lguest_virtnet_init(void)
+{
+	int ret = register_lguest_driver(&lguest_virtnet_drv);
+
+	return ret;
+}
+device_initcall(lguest_virtnet_init);
+
+/* Example block driver code. */
+#include <linux/virtio_blk.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+/* Bus probe for the example block device: create the virtio state, read
+ * the capacity out of the descriptor page, let virtblk_probe() build a
+ * disk, hook up the interrupt under the disk's name and add the disk.
+ * Returns 0 or a negative errno, undoing whatever was set up on failure. */
+static int lguest_virtblk_probe(struct lguest_device *lgdev)
+{
+	struct lguest_virtio_device *lgv = lg_new_virtio(lgdev);
+	struct gendisk *disk;
+	unsigned long *capacity;
+	unsigned long sectors;
+	int err;
+
+	if (!lgv)
+		return -ENOMEM;
+
+	/* Page is initially used to pass capacity: take it from the first
+	 * word, then clear that word. */
+	capacity = (unsigned long *)lgv->in.desc;
+	sectors = *capacity;
+	*capacity = 0;
+
+	disk = virtblk_probe(&lgv->vdev);
+	lgv->priv = disk;
+	if (IS_ERR(disk)) {
+		err = PTR_ERR(disk);
+		goto destroy;
+	}
+	set_capacity(disk, sectors);
+	blk_queue_max_hw_segments(disk->queue, NUM_DESCS-1);
+
+	err = lg_setup_interrupt(lgv, disk->disk_name);
+	if (err) {
+		/* No interrupt: take the disk down again. */
+		virtblk_remove(disk);
+		goto destroy;
+	}
+	add_disk(disk);
+	return 0;
+
+destroy:
+	lg_destroy_virtio(lgv);
+	return err;
+}
+
+/* lguest bus driver for devices of type LGUEST_DEVICE_T_VIRTBLK. */
+static struct lguest_driver lguest_virtblk_drv = {
+ .name = "lguestvirtblk",
+ .owner = THIS_MODULE,
+ .device_type = LGUEST_DEVICE_T_VIRTBLK,
+ .probe = lguest_virtblk_probe,
+};
+
+/* Register the example block driver with the lguest bus. */
+static __init int lguest_virtblk_init(void)
+{
+	int ret = register_lguest_driver(&lguest_virtblk_drv);
+
+	return ret;
+}
+device_initcall(lguest_virtblk_init);
+
+MODULE_LICENSE("GPL");
===================================================================
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -23,6 +23,9 @@
#define LHCALL_SET_PTE 14
#define LHCALL_SET_PMD 15
#define LHCALL_LOAD_TLS 16
+
+/* Experimental hcalls for new I/O */
+#define LHCALL_NOTIFY 100 /* arg: pfn of the device's descriptor pages */
#define LG_CLOCK_MIN_DELTA 100UL
#define LG_CLOCK_MAX_DELTA ULONG_MAX
===================================================================
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -44,6 +44,8 @@ struct lguest_device_desc {
#define LGUEST_DEVICE_T_CONSOLE 1
#define LGUEST_DEVICE_T_NET 2
#define LGUEST_DEVICE_T_BLOCK 3
+#define LGUEST_DEVICE_T_VIRTNET 8
+#define LGUEST_DEVICE_T_VIRTBLK 9
u16 features;
#define LGUEST_NET_F_NOCSUM 0x4000 /* Don't bother checksumming */
@@ -70,4 +72,26 @@ enum lguest_req
LHREQ_IRQ, /* + irq */
LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
};
+
+/* This marks a buffer as being the start (and active) */
+#define LGUEST_DESC_F_HEAD 1
+/* This marks a buffer as continuing via the next field. */
+#define LGUEST_DESC_F_NEXT 2
+
+/* Virtio descriptors: each describes one piece of a buffer.  A buffer is
+ * a chain of these, starting at a LGUEST_DESC_F_HEAD descriptor and
+ * continuing via next while LGUEST_DESC_F_NEXT is set. */
+struct lguest_desc
+{
+ unsigned long pfn; /* page frame number of the page holding the data */
+ unsigned long len; /* length of this piece */
+ u16 offset; /* offset of the data within that page */
+ u16 flags; /* LGUEST_DESC_F_* */
+ /* We chain unused descriptors via this, too */
+ u32 next;
+};
+
+/* One entry in the "used" array: filled in by the other side to hand a
+ * buffer back. */
+struct lguest_used
+{
+ unsigned int id; /* head descriptor number of the buffer */
+ unsigned int len; /* length reported back to the driver for it */
+};
#endif /* _ASM_LGUEST_USER */
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|