In this episode, Rusty tries to NAPI-ize the driver and discovers that
virtio callbacks are a bad idea: NAPI needs to turn interrupts off and
still be able to query for new incoming packets.
Changes to core:
1) Back to "interrupt" model with get_inbuf()/get_outbuf() calls.
2) Clearer rules for locking: in calls cannot overlap each other, out calls
cannot overlap each other, but an in call may overlap an out call.
3) Methods for suppressing/enabling "interrupt" calls (usage sketched below).
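To make the new model concrete, here's a minimal usage sketch (not part of
the patch; schedule_my_poll() and handle_buffer() are hypothetical
placeholders): the ->in callback defers work and returns false to suppress
further callbacks, the poll routine drains get_inbuf(), then re-enables via
restart_in() and loops if that raced with another completion:

static bool example_in_cb(struct virtio_device *vdev)
{
	schedule_my_poll(vdev->priv);	/* hypothetical: kick the poll routine */
	return false;			/* suppress further inbuf callbacks */
}

static void example_poll(struct virtio_device *vdev)
{
	unsigned int len;
	void *token;

again:
	/* Each token is the "data" pointer we handed to add_inbuf(). */
	while ((token = vdev->ops->get_inbuf(vdev, &len)) != NULL)
		handle_buffer(token, len);	/* hypothetical consumer */

	/* restart_in() returns false (and stays disabled) if more inbufs
	 * completed in the meantime, so go around again. */
	if (!vdev->ops->restart_in(vdev))
		goto again;
}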
Changes to example net driver:
1) Converted to NAPI; locking is now correct (and there is none).
Changes to example block driver:
1) Relay SCSI ioctls (particularly CDROMEJECT) for optional server
support (VIRTIO_BLK_T_SCSI_CMD).
2) /dev/vb -> /dev/vd.
3) Barrier support.
4) Header w/ definitions can be included from userspace (rough server-side
sketch below).
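To illustrate why the header is now userspace-visible, a host/server
implementation might dispatch on the request header roughly like this (a
sketch only, not part of the patch; handle_io() is a hypothetical placeholder
and all descriptor plumbing is omitted):

#include <linux/virtio_blk.h>

/* Returns the one-byte status the guest reads back via virtio_blk_inhdr. */
static unsigned char serve_request(const struct virtio_blk_outhdr *out)
{
	/* The barrier bit is a flag on top of the ordinary request type. */
	__u32 type = out->type & ~VIRTIO_BLK_T_BARRIER;

	switch (type) {
	case VIRTIO_BLK_T_READ:
	case VIRTIO_BLK_T_WRITE:
		/* handle_io(): hypothetical data transfer at out->sector * 512. */
		return handle_io(out) ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
	case VIRTIO_BLK_T_SCSI_CMD:
		/* SCSI relay (e.g. CDROMEJECT) is optional for servers. */
		return VIRTIO_BLK_S_UNSUPP;
	default:
		return VIRTIO_BLK_S_IOERR;
	}
}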
Here's the inter-diff for the three:
diff -u b/include/linux/virtio.h b/include/linux/virtio.h
--- b/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -7,69 +7,109 @@
/**
* virtio_device - description and routines to drive a virtual device.
- * @lock: the lock to hold before calling any functions.
* @dev: the underlying struct device.
* @ops: the operations for this virtual device.
+ * @driver_ops: set by the driver for callbacks.
* @priv: private pointer for the driver to use.
*/
struct virtio_device {
- spinlock_t lock;
struct device *dev;
struct virtio_ops *ops;
+ struct virtio_driver_ops *driver_ops;
void *priv;
};
/**
+ * virtio_driver_ops - driver callbacks for a virtual device.
+ * @in: inbufs have been completed.
+ * Usually called from an interrupt handler.
+ * Return false to suppress further inbuf callbacks.
+ * @out: outbufs have been completed.
+ * Usually called from an interrupt handler.
+ * Return false to suppress further outbuf callbacks.
+ */
+struct virtio_driver_ops {
+ bool (*in)(struct virtio_device *dev);
+ bool (*out)(struct virtio_device *dev);
+};
+
+enum virtio_dir {
+ VIRTIO_IN = 0x1,
+ VIRTIO_OUT = 0x2,
+};
+
+/**
* virtio_ops - virtio abstraction layer
* @add_outbuf: prepare to send data to the other end:
* vdev: the virtio_device
* sg: the description of the buffer(s).
* num: the size of the sg array.
- * cb: the function to call once the outbuf is finished & detached.
- * data: the token to hand to the cb function.
- * Returns a unique id or an error. Note that the callback will be
- * called with the lock held, and possibly in an interrupt handler.
+ * data: the token returned by the get_outbuf function.
+ * Returns a unique id or an error.
* @add_inbuf: prepare to receive data from the other end:
* vdev: the virtio_device
* sg: the description of the buffer(s).
* num: the size of the sg array.
- * cb: the function to call once the inbuf is finished & detached.
- * data: the token to hand to the cb function.
- * Returns a unique id or an error (eg. -ENOSPC). Note that the
- * callback will be called with the lock held, and possibly in an
- * interrupt handler.
- * @sync: update after add_inbuf/add_outbuf
+ * data: the token returned by the get_inbuf function.
+ * Returns a unique id or an error (eg. -ENOSPC).
+ * @sync: update after add_inbuf and/or add_outbuf
* vdev: the virtio_device we're talking about.
+ * inout: VIRTIO_IN and/or VIRTIO_OUT
* After one or more add_inbuf/add_outbuf calls, invoke this to kick
* the virtio layer.
+ * @get_outbuf: get the next used outbuf.
+ * vdev: the virtio_device we're talking about.
+ * len: the length written into the outbuf
+ * Returns NULL or the "data" token handed to add_outbuf (which has been
+ * detached).
+ * @get_inbuf: get the next used inbuf.
+ * vdev: the virtio_device we're talking about.
+ * len: the length read from the inbuf
+ * Returns NULL or the "data" token handed to add_inbuf (which has been
+ * detached).
* @detach_outbuf: make sure sent sg can no longer be read.
* vdev: the virtio_device we're talking about.
* id: the id returned from add_outbuf.
- * This is not necessary (or valid!) if the outbuf callback has
- * already fired.
+ * This is usually used for shutdown. Don't try to detach twice.
* @detach_inbuf: make sure sent sg can no longer be written to.
* vdev: the virtio_device we're talking about.
* id: the id returned from add_inbuf.
- * This is not necessary (or valid!) if the outbuf callback has
- * already fired.
+ * This is usually used for shutdown. Don't try to detach twice.
+ * @restart_in: restart calls to driver_ops->in after it returned false.
+ * vdev: the virtio_device we're talking about.
+ * This returns "false" (and doesn't re-enable) if there are pending
+ * inbufs, to avoid a race.
+ * @restart_out: restart calls to driver_ops->out after it returned false.
+ * vdev: the virtio_device we're talking about.
+ * This returns "false" (and doesn't re-enable) if there are pending
+ * outbufs, to avoid a race.
+ *
+ * Locking rules are straightforward: the driver is responsible for
+ * locking. Outbuf operations can be called in parallel to inbuf
+ * operations, but no two outbuf operations nor two inbuf operations
+ * may be invoked simultaneously.
+ *
+ * All operations can be called in any context.
*/
struct virtio_ops {
unsigned long (*add_outbuf)(struct virtio_device *vdev,
const struct scatterlist sg[],
unsigned int num,
- void (*cb)(struct virtio_device *vdev,
- void *data, unsigned len),
void *data);
unsigned long (*add_inbuf)(struct virtio_device *vdev,
struct scatterlist sg[],
unsigned int num,
- void (*cb)(struct virtio_device *vdev,
- void *data, unsigned len),
void *data);
- void (*sync)(struct virtio_device *vdev);
+ void (*sync)(struct virtio_device *vdev, enum virtio_dir inout);
+
+ void *(*get_outbuf)(struct virtio_device *vdev, unsigned int *len);
+ void *(*get_inbuf)(struct virtio_device *vdev, unsigned int *len);
void (*detach_outbuf)(struct virtio_device *vdev, unsigned long id);
void (*detach_inbuf)(struct virtio_device *vdev, unsigned long id);
+
+ bool (*restart_in)(struct virtio_device *vdev);
+ bool (*restart_out)(struct virtio_device *vdev);
};
#endif /* _LINUX_VIRTIO_H */
diff -u b/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
--- b/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -33,29 +33,21 @@
struct virtio_device *vdev;
struct net_device *ndev;
+ /* Number of input buffers, and max we've ever had. */
+ unsigned int num, max;
+
/* Receive & send queues. */
struct sk_buff_head recv;
struct sk_buff_head send;
-
- /* Transmitted packets waiting to be freed */
- struct sk_buff_head free;
};
-static void skb_xmit_done(struct virtio_device *vdev, void *_skb, unsigned len)
+static bool skb_xmit_done(struct virtio_device *vdev)
{
struct virtnet_info *vi = vdev->priv;
- struct sk_buff *skb = _skb;
-
- assert_spin_locked(&vdev->lock);
-
- __skb_unlink(skb, &vi->send);
- vi->ndev->stats.tx_bytes += len;
- vi->ndev->stats.tx_packets++;
- __skb_queue_head(&vi->free, skb);
- pr_debug("Sent skb %p\n", skb);
/* In case we were waiting for output buffers. */
netif_wake_queue(vi->ndev);
+ return true;
}
static void receive_skb(struct net_device *dev, struct sk_buff *skb,
@@ -78,48 +70,90 @@
netif_rx(skb);
}
-static void skb_recv_done(struct virtio_device *, void *, unsigned);
-static int try_fill_recv(struct virtnet_info *vi)
+static void try_fill_recv(struct virtnet_info *vi)
{
struct sk_buff *skb;
struct scatterlist sg[MAX_SKB_FRAGS];
- unsigned long num, id;
-
- assert_spin_locked(&vi->vdev->lock);
+ unsigned long sgnum, id;
- skb = netdev_alloc_skb(vi->ndev, MAX_PACKET_LEN);
- if (unlikely(!skb))
- return -ENOMEM;
+ for (;;) {
+ skb = netdev_alloc_skb(vi->ndev, MAX_PACKET_LEN);
+ if (unlikely(!skb))
+ break;
+
+ skb_put(skb, MAX_PACKET_LEN);
+ sgnum = skb_to_sgvec(skb, sg, 0, skb->len);
+ skb_queue_head(&vi->recv, skb);
+
+ id = vi->vdev->ops->add_inbuf(vi->vdev, sg, sgnum, skb);
+ if (IS_ERR_VALUE(id)) {
+ skb_unlink(skb, &vi->recv);
+ kfree_skb(skb);
+ break;
+ }
+ vi->num++;
+ }
+ if (unlikely(vi->num > vi->max))
+ vi->max = vi->num;
+ vi->vdev->ops->sync(vi->vdev, VIRTIO_IN);
+}
- skb_put(skb, MAX_PACKET_LEN);
- num = skb_to_sgvec(skb, sg, 0, skb->len);
- skb_queue_head(&vi->recv, skb);
+static bool skb_recv_done(struct virtio_device *vdev)
+{
+ struct virtnet_info *vi = vdev->priv;
- id = vi->vdev->ops->add_inbuf(vi->vdev, sg, num, skb_recv_done, skb);
- if (IS_ERR_VALUE(id)) {
- skb_unlink(skb, &vi->recv);
- kfree_skb(skb);
- return id;
- }
- return 0;
+ netif_rx_schedule(vi->ndev);
+ /* Suppress further interrupts. */
+ return false;
}
-static void skb_recv_done(struct virtio_device *vdev, void *_skb, unsigned len)
+static int virtnet_poll(struct net_device *dev, int *budget)
{
- struct virtnet_info *vi = vdev->priv;
- struct sk_buff *skb = _skb;
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct sk_buff *skb = NULL;
+ unsigned int len, received = 0;
- assert_spin_locked(&vdev->lock);
- __skb_unlink(skb, &vi->recv);
- receive_skb(vi->ndev, skb, len);
- try_fill_recv(vi);
+again:
+ while (received < dev->quota &&
+ (skb = vi->vdev->ops->get_inbuf(vi->vdev, &len)) != NULL) {
+ __skb_unlink(skb, &vi->recv);
+ receive_skb(vi->ndev, skb, len);
+ vi->num--;
+ received++;
+ }
+
+ dev->quota -= received;
+ *budget -= received;
+
+ /* FIXME: If we oom and completely run out of inbufs, we need
+ * to start a timer trying to fill more. */
+ if (vi->num < vi->max / 2)
+ try_fill_recv(vi);
+
+ /* Still more work to do? */
+ if (skb)
+ return 1; /* not done */
+
+ netif_rx_complete(dev);
+ if (unlikely(!vi->vdev->ops->restart_in(vi->vdev))
+ && netif_rx_reschedule(dev, received))
+ goto again;
+
+ return 0;
}
-static void free_old_skbs(struct sk_buff_head *free)
+static void free_old_xmit_skbs(struct virtnet_info *vi)
{
struct sk_buff *skb;
- while ((skb = __skb_dequeue(free)) != NULL)
+ unsigned int len;
+
+ while ((skb = vi->vdev->ops->get_outbuf(vi->vdev, &len)) != NULL) {
+ pr_debug("Sent skb %p\n", skb);
+ __skb_unlink(skb, &vi->send);
+ vi->ndev->stats.tx_bytes += len;
+ vi->ndev->stats.tx_packets++;
kfree_skb(skb);
+ }
}
static int start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -128,19 +162,16 @@
unsigned long num, id;
struct scatterlist sg[MAX_SKB_FRAGS];
const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
- unsigned long flags;
pr_debug("%s: xmit %p %02x:%02x:%02x:%02x:%02x:%02x\n",
dev->name, skb,
dest[0], dest[1], dest[2], dest[3], dest[4], dest[5]);
- spin_lock_irqsave(&vi->vdev->lock, flags);
- /* Free any transmitted packets: not supposed to do it in interrupt */
- free_old_skbs(&vi->free);
+ free_old_xmit_skbs(vi);
num = skb_to_sgvec(skb, sg, 0, skb->len);
__skb_queue_head(&vi->send, skb);
- id = vi->vdev->ops->add_outbuf(vi->vdev, sg, num, skb_xmit_done, skb);
+ id = vi->vdev->ops->add_outbuf(vi->vdev, sg, num, skb);
if (IS_ERR_VALUE(id)) {
pr_debug("%s: virtio not prepared to send\n", dev->name);
skb_unlink(skb, &vi->send);
@@ -148,8 +179,7 @@
return NETDEV_TX_BUSY;
}
SKB_ID(skb) = id;
- vi->vdev->ops->sync(vi->vdev);
- spin_unlock_irqrestore(&vi->vdev->lock, flags);
+ vi->vdev->ops->sync(vi->vdev, VIRTIO_OUT);
return 0;
}
@@ -157,16 +187,12 @@
static int virtnet_open(struct net_device *dev)
{
struct virtnet_info *vi = netdev_priv(dev);
- int i, err;
- spin_lock_irq(&vi->vdev->lock);
- for (i = 0; (err = try_fill_recv(vi)) == 0; i++);
- vi->vdev->ops->sync(vi->vdev);
- spin_unlock_irq(&vi->vdev->lock);
+ try_fill_recv(vi);
/* If we didn't even get one input buffer, we're useless. */
- if (i == 0)
- return err;
+ if (vi->num == 0)
+ return -ENOMEM;
return 0;
}
@@ -176,20 +202,26 @@
struct virtnet_info *vi = netdev_priv(dev);
struct sk_buff *skb;
- spin_lock_irq(&vi->vdev->lock);
+ /* networking core has neutered skb_xmit_done/skb_recv_done, so don't
+ * worry about races vs. get_buf(). */
while ((skb = __skb_dequeue(&vi->recv)) != NULL) {
vi->vdev->ops->detach_inbuf(vi->vdev, SKB_ID(skb));
kfree_skb(skb);
+ vi->num--;
}
while ((skb = __skb_dequeue(&vi->send)) != NULL) {
vi->vdev->ops->detach_outbuf(vi->vdev, SKB_ID(skb));
kfree_skb(skb);
}
- free_old_skbs(&vi->free);
- spin_unlock_irq(&vi->vdev->lock);
+ BUG_ON(vi->num != 0);
return 0;
}
+static struct virtio_driver_ops virtnet_ops = {
+ .in = skb_recv_done,
+ .out = skb_xmit_done,
+};
+
struct net_device *virtnet_probe(struct virtio_device *vdev,
const u8 mac[ETH_ALEN])
{
@@ -207,16 +239,18 @@
memcpy(dev->dev_addr, mac, ETH_ALEN);
dev->open = virtnet_open;
dev->stop = virtnet_close;
+ dev->poll = virtnet_poll;
dev->hard_start_xmit = start_xmit;
+ dev->weight = 16;
SET_NETDEV_DEV(dev, vdev->dev);
vi = netdev_priv(dev);
vi->vdev = vdev;
vi->ndev = dev;
vdev->priv = vi;
+ vdev->driver_ops = &virtnet_ops;
skb_queue_head_init(&vi->recv);
skb_queue_head_init(&vi->send);
- skb_queue_head_init(&vi->free);
err = register_netdev(dev);
if (err) {
diff -u b/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
--- b/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1,4 +1,4 @@
-//#define DEBUG
+#define DEBUG
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
@@ -8,6 +8,8 @@
static unsigned char virtblk_index = 'a';
struct virtio_blk
{
+ spinlock_t lock;
+
struct virtio_device *vdev;
/* The disk structure for the kernel. */
@@ -19,7 +21,7 @@
mempool_t *pool;
/* Scatterlist: can be too big for stack. */
- struct scatterlist sg[1+MAX_PHYS_SEGMENTS];
+ struct scatterlist sg[2+MAX_PHYS_SEGMENTS];
};
struct virtblk_req
@@ -28,68 +30,94 @@
struct request *req;
unsigned long out_id;
bool out_done, in_done;
- bool failed;
+ int uptodate;
struct virtio_blk_outhdr out_hdr;
struct virtio_blk_inhdr in_hdr;
};
-/* Jens gave me this nice helper to end all chunks of a request. */
-static void end_dequeued_request(struct request *req, int uptodate)
+static void end_dequeued_request(struct request *req,
+ request_queue_t *q, int uptodate)
{
- if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
+ /* And so the insanity of the block layer infects us here. */
+ int nsectors = req->hard_nr_sectors;
+
+ if (blk_pc_request(req)) {
+ nsectors = (req->data_len + 511) >> 9;
+ if (!nsectors)
+ nsectors = 1;
+ printk("uptodate = %i\n", uptodate);
+ }
+ if (end_that_request_first(req, uptodate, nsectors))
BUG();
add_disk_randomness(req->rq_disk);
end_that_request_last(req, uptodate);
}
-static void finish(struct virtio_blk *vblk, struct virtblk_req *vbr)
+static bool finish(struct virtio_blk *vblk, struct virtblk_req *vbr)
{
- end_dequeued_request(vbr->req, !vbr->failed);
+ if (!vbr->in_done || !vbr->out_done)
+ return false;
+ end_dequeued_request(vbr->req, vblk->disk->queue, vbr->uptodate);
list_del(&vbr->list);
mempool_free(vbr, vblk->pool);
- /* In case queue is stopped waiting for more buffers. */
- blk_start_queue(vblk->disk->queue);
+ return true;
}
/* We make sure they finished both the input and output buffers: otherwise
* they might still have read access after we free them. */
-static void blk_out_done(struct virtio_device *vdev, void *_vbr, unsigned len)
+static bool blk_out_done(struct virtio_device *vdev)
{
- struct virtblk_req *vbr = _vbr;
struct virtio_blk *vblk = vdev->priv;
+ struct virtblk_req *vbr;
+ unsigned int len, finished = 0;
+ unsigned long flags;
- assert_spin_locked(&vblk->vdev->lock);
-
- BUG_ON(vbr->out_done);
- vbr->out_done = true;
- if (vbr->in_done)
- finish(vblk, vbr);
+ spin_lock_irqsave(&vblk->lock, flags);
+ while ((vbr = vdev->ops->get_outbuf(vdev, &len)) != NULL) {
+ BUG_ON(vbr->out_done);
+ vbr->out_done = true;
+ finished += finish(vblk, vbr);
+ }
+ /* In case queue is stopped waiting for more buffers. */
+ if (finished)
+ blk_start_queue(vblk->disk->queue);
+ spin_unlock_irqrestore(&vblk->lock, flags);
+ return true;
}
-static void blk_in_done(struct virtio_device *vdev, void *_vbr, unsigned len)
+static bool blk_in_done(struct virtio_device *vdev)
{
- struct virtblk_req *vbr = _vbr;
struct virtio_blk *vblk = vdev->priv;
- unsigned long expected_len;
+ struct virtblk_req *vbr;
+ unsigned int len, finished = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vblk->lock, flags);
- assert_spin_locked(&vblk->vdev->lock);
+ while ((vbr = vdev->ops->get_inbuf(vdev, &len)) != NULL) {
+ BUG_ON(vbr->in_done);
- expected_len = sizeof(vbr->in_hdr);
- if (vbr->out_hdr.type == READ)
- expected_len += vbr->req->hard_nr_sectors*512;
-
- if (unlikely(len != expected_len)) {
- dev_err(vblk->vdev->dev, "short reply %u not %lu",
- len, expected_len);
- vbr->failed = true;
- } else if (unlikely(vbr->in_hdr.status != 1)) {
- vbr->failed = true;
+ switch (vbr->in_hdr.status) {
+ case VIRTIO_BLK_S_OK:
+ vbr->uptodate = 1;
+ break;
+ case VIRTIO_BLK_S_UNSUPP:
+ printk("Request was unsupported\n");
+ vbr->uptodate = -ENOTTY;
+ break;
+ default:
+ vbr->uptodate = 0;
+ break;
+ }
+ vbr->in_done = true;
+ finished += finish(vblk, vbr);
}
- BUG_ON(vbr->in_done);
- vbr->in_done = true;
- if (vbr->out_done)
- finish(vblk, vbr);
+ /* In case queue is stopped waiting for more buffers. */
+ if (finished)
+ blk_start_queue(vblk->disk->queue);
+ spin_unlock_irqrestore(&vblk->lock, flags);
+ return true;
}
static bool do_write(request_queue_t *q, struct virtio_blk *vblk,
@@ -97,12 +125,14 @@
{
unsigned long num;
+ vbr->out_hdr.type |= VIRTIO_BLK_T_WRITE;
+
/* Set up for reply. */
vblk->sg[0].page = virt_to_page(&vbr->in_hdr);
vblk->sg[0].offset = offset_in_page(&vbr->in_hdr);
vblk->sg[0].length = sizeof(vbr->in_hdr);
vbr->out_hdr.id = vblk->vdev->ops->add_inbuf(vblk->vdev, vblk->sg, 1,
- blk_in_done, vbr);
+ vbr);
if (IS_ERR_VALUE(vbr->out_hdr.id))
goto full;
@@ -112,15 +142,13 @@
vblk->sg[0].length = sizeof(vbr->out_hdr);
num = blk_rq_map_sg(q, vbr->req, vblk->sg+1);
- vbr->out_done = vbr->in_done = false;
- vbr->failed = false;
vbr->out_id = vblk->vdev->ops->add_outbuf(vblk->vdev, vblk->sg, 1+num,
- blk_out_done, vbr);
+ vbr);
if (IS_ERR_VALUE(vbr->out_id))
goto detach_inbuf_full;
pr_debug("Write: %p in=%lu out=%lu\n", vbr,
- vbr->out_hdr.id, vbr->out_id);
+ (long)vbr->out_hdr.id, (long)vbr->out_id);
list_add_tail(&vbr->list, &vblk->reqs);
return true;
@@ -135,13 +163,15 @@
{
unsigned long num;
+ vbr->out_hdr.type |= VIRTIO_BLK_T_READ;
+
/* Set up for reply. */
vblk->sg[0].page = virt_to_page(&vbr->in_hdr);
vblk->sg[0].offset = offset_in_page(&vbr->in_hdr);
vblk->sg[0].length = sizeof(vbr->in_hdr);
num = blk_rq_map_sg(q, vbr->req, vblk->sg+1);
vbr->out_hdr.id = vblk->vdev->ops->add_inbuf(vblk->vdev, vblk->sg,
- 1+num, blk_in_done, vbr);
+ 1+num, vbr);
if (IS_ERR_VALUE(vbr->out_hdr.id))
goto full;
@@ -149,15 +179,53 @@
vblk->sg[0].offset = offset_in_page(&vbr->out_hdr);
vblk->sg[0].length = sizeof(vbr->out_hdr);
- vbr->out_done = vbr->in_done = false;
- vbr->failed = false;
vbr->out_id = vblk->vdev->ops->add_outbuf(vblk->vdev, vblk->sg, 1,
- blk_out_done, vbr);
+ vbr);
if (IS_ERR_VALUE(vbr->out_id))
goto detach_inbuf_full;
pr_debug("Read: %p in=%lu out=%lu\n", vbr,
- vbr->out_hdr.id, vbr->out_id);
+ (long)vbr->out_hdr.id, (long)vbr->out_id);
+ list_add_tail(&vbr->list, &vblk->reqs);
+ return true;
+
+detach_inbuf_full:
+ vblk->vdev->ops->detach_inbuf(vblk->vdev, vbr->out_hdr.id);
+full:
+ return false;
+}
+
+static bool do_scsi(request_queue_t *q, struct virtio_blk *vblk,
+ struct virtblk_req *vbr)
+{
+ unsigned long num;
+
+ vbr->out_hdr.type |= VIRTIO_BLK_T_SCSI_CMD;
+
+ /* Set up for reply. */
+ vblk->sg[0].page = virt_to_page(&vbr->in_hdr);
+ vblk->sg[0].offset = offset_in_page(&vbr->in_hdr);
+ vblk->sg[0].length = sizeof(vbr->in_hdr);
+ vbr->out_hdr.id = vblk->vdev->ops->add_inbuf(vblk->vdev, vblk->sg, 1,
+ vbr);
+ if (IS_ERR_VALUE(vbr->out_hdr.id))
+ goto full;
+
+ vblk->sg[0].page = virt_to_page(&vbr->out_hdr);
+ vblk->sg[0].offset = offset_in_page(&vbr->out_hdr);
+ vblk->sg[0].length = sizeof(vbr->out_hdr);
+ vblk->sg[1].page = virt_to_page(vbr->req->cmd);
+ vblk->sg[1].offset = offset_in_page(vbr->req->cmd);
+ vblk->sg[1].length = vbr->req->cmd_len;
+
+ num = blk_rq_map_sg(q, vbr->req, vblk->sg+1);
+ vbr->out_id = vblk->vdev->ops->add_outbuf(vblk->vdev, vblk->sg, 2+num,
+ vbr);
+ if (IS_ERR_VALUE(vbr->out_id))
+ goto detach_inbuf_full;
+
+ pr_debug("Scsi: %p in=%lu out=%lu\n", vbr,
+ (long)vbr->out_hdr.id, (long)vbr->out_id);
list_add_tail(&vbr->list, &vblk->reqs);
return true;
@@ -176,37 +244,38 @@
while ((req = elv_next_request(q)) != NULL) {
vblk = req->rq_disk->private_data;
- /* FIXME: handle these iff capable. */
- if (!blk_fs_request(req)) {
- pr_debug("Got non-command 0x%08x\n", req->cmd_type);
- req->errors++;
- blkdev_dequeue_request(req);
- end_dequeued_request(req, 0);
- continue;
- }
-
vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
if (!vbr)
goto stop;
BUG_ON(req->nr_phys_segments > ARRAY_SIZE(vblk->sg));
vbr->req = req;
- vbr->out_hdr.type = rq_data_dir(req);
+ /* Actual type gets or'ed in do_scsi/do_write/do_read */
+ vbr->out_hdr.type = blk_barrier_rq(req)?VIRTIO_BLK_T_BARRIER:0;
vbr->out_hdr.sector = req->sector;
+ vbr->out_hdr.ioprio = req->ioprio;
+ vbr->out_done = vbr->in_done = false;
- if (rq_data_dir(req) == WRITE) {
- if (!do_write(q, vblk, vbr))
- goto stop;
- } else {
- if (!do_read(q, vblk, vbr))
+ if (blk_pc_request(req)) {
+ if (!do_scsi(q, vblk, vbr))
goto stop;
- }
+ } else if (blk_fs_request(req)) {
+ if (rq_data_dir(req) == WRITE) {
+ if (!do_write(q, vblk, vbr))
+ goto stop;
+ } else {
+ if (!do_read(q, vblk, vbr))
+ goto stop;
+ }
+ } else
+ /* We don't put anything else in the queue. */
+ BUG();
blkdev_dequeue_request(req);
}
sync:
if (vblk)
- vblk->vdev->ops->sync(vblk->vdev);
+ vblk->vdev->ops->sync(vblk->vdev, VIRTIO_IN|VIRTIO_OUT);
return;
stop:
@@ -216,7 +285,21 @@
goto sync;
}
+static int virtblk_ioctl(struct inode *inode, struct file *filp,
+ unsigned cmd, unsigned long data)
+{
+ return scsi_cmd_ioctl(filp, inode->i_bdev->bd_disk, cmd,
+ (void __user *)data);
+}
+
+static struct virtio_driver_ops virtblk_ops = {
+ .in = blk_in_done,
+ .out = blk_out_done,
+};
+
+
static struct block_device_operations virtblk_fops = {
+ .ioctl = virtblk_ioctl,
.owner = THIS_MODULE,
};
@@ -232,8 +315,10 @@
}
INIT_LIST_HEAD(&vblk->reqs);
+ spin_lock_init(&vblk->lock);
vblk->vdev = vdev;
vdev->priv = vblk;
+ vdev->driver_ops = &virtblk_ops;
vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
if (!vblk->pool) {
@@ -254,19 +339,20 @@
goto out_unregister_blkdev;
}
- vblk->disk->queue = blk_init_queue(do_virtblk_request,
- &vblk->vdev->lock);
+ vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
if (!vblk->disk->queue) {
err = -ENOMEM;
goto out_put_disk;
}
- sprintf(vblk->disk->disk_name, "vb%c", virtblk_index++);
+ sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++);
vblk->disk->major = major;
vblk->disk->first_minor = 0;
vblk->disk->private_data = vblk;
vblk->disk->fops = &virtblk_fops;
+ blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL);
+
/* Caller can do blk_queue_max_hw_segments(), set_capacity()
* etc then add_disk(). */
return vblk->disk;
diff -u b/include/linux/virtio_blk.h b/include/linux/virtio_blk.h
--- b/include/linux/virtio_blk.h
+++ b/include/linux/virtio_blk.h
@@ -3,26 +3,37 @@
#include <linux/types.h>
-struct gendisk;
-struct virtio_device;
-struct hd_geometry;
+
+#define VIRTIO_BLK_T_READ 0
+#define VIRTIO_BLK_T_WRITE 1
+#define VIRTIO_BLK_T_SCSI_CMD 3
+#define VIRTIO_BLK_T_BARRIER 0x80000000 /* Barrier before this op. */
/* This is the first element of the scatter-gather list. */
struct virtio_blk_outhdr
{
- /* 0 == read, 1 == write */
- u32 type;
+ /* VIRTIO_BLK_T* */
+ __u32 type;
+ /* io priority. */
+ __u32 ioprio;
/* Sector (ie. 512 byte offset) */
- unsigned long sector;
+ __u64 sector;
/* Where to put reply. */
- unsigned long id;
+ __u64 id;
};
+#define VIRTIO_BLK_S_OK 0
+#define VIRTIO_BLK_S_IOERR 1
+#define VIRTIO_BLK_S_UNSUPP 2
+
struct virtio_blk_inhdr
{
- /* 1 = OK, 0 = not ok. */
- unsigned long status;
+ unsigned char status;
};
+#ifdef __KERNEL__
+struct gendisk;
+struct virtio_device;
+
struct gendisk *virtblk_probe(struct virtio_device *vdev);
void virtblk_remove(struct gendisk *disk);
-
+#endif /* __KERNEL__ */
#endif /* _LINUX_VIRTIO_BLK_H */
only in patch2:
unchanged:
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -341,6 +341,7 @@ unifdef-y += utsname.h
unifdef-y += utsname.h
unifdef-y += videodev2.h
unifdef-y += videodev.h
+unifdef-y += virtio_blk.h
unifdef-y += wait.h
unifdef-y += wanrouter.h
unifdef-y += watchdog.h