[Xen-devel] Re: [patch] CFQ for xen domains

Gerd Knorr wrote:

  Hi folks,
New version of the patch, adapted to apply cleanly against latest


One more version, this time against 3.0-final ;)

cheers,

  Gerd

diff -r 0255f48b757f linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c        Sun Dec  4 19:12:00 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c        Tue Dec  6 15:29:06 2005
@@ -12,6 +12,8 @@
  */
 
 #include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
 #include <asm-xen/balloon.h>
 #include <asm/hypervisor.h>
 #include "common.h"
@@ -21,26 +23,26 @@
  * pulled from a communication ring are quite likely to end up being part of
  * the same scatter/gather request at the disc.
  * 
- * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
+ * 
  * This will increase the chances of being able to write whole tracks.
  * 64 should be enough to keep us competitive with Linux.
  */
-#define MAX_PENDING_REQS 64
-#define BATCH_PER_DOMAIN 16
-
-static unsigned long mmap_vstart;
-#define MMAP_PAGES                                             \
-       (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
-#ifdef __ia64__
-static void *pending_vaddrs[MMAP_PAGES];
-#define MMAP_VADDR(_idx, _i) \
-       (unsigned long)(pending_vaddrs[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#else
-#define MMAP_VADDR(_req,_seg)                                          \
-       (mmap_vstart +                                                  \
-        ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
-        ((_seg) * PAGE_SIZE))
-#endif
+static int blkif_reqs = 64;
+static int mmap_pages;
+
+static int __init set_blkif_reqs(char *str)
+{
+       get_option(&str, &blkif_reqs);
+       return 1;
+}
+__setup("blkif_reqs=", set_blkif_reqs);
+
+/* runtime-switchable, check /sys/module/blkback/parameters/ ;) */
+static unsigned int log_stats = 0;
+static unsigned int debug_lvl = 0;
+module_param(log_stats, uint, 0644);
+module_param(debug_lvl, uint, 0644);
 
 /*
  * Each outstanding request that we've passed to the lower device layers has a 
@@ -55,43 +57,38 @@
        atomic_t       pendcnt;
        unsigned short operation;
        int            status;
+       struct list_head free_list;
 } pending_req_t;
 
-/*
- * We can't allocate pending_req's in order, since they may complete out of 
- * order. We therefore maintain an allocation ring. This ring also indicates 
- * when enough work has been passed down -- at that point the allocation ring 
- * will be empty.
- */
-static pending_req_t pending_reqs[MAX_PENDING_REQS];
-static unsigned char pending_ring[MAX_PENDING_REQS];
-static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
-/* NB. We use a different index type to differentiate from shared blk rings. */
-typedef unsigned int PEND_RING_IDX;
-#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
-static PEND_RING_IDX pending_prod, pending_cons;
-#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
-
-static request_queue_t *plugged_queue;
-static inline void flush_plugged_queue(void)
-{
-       request_queue_t *q = plugged_queue;
-       if (q != NULL) {
-               if ( q->unplug_fn != NULL )
-                       q->unplug_fn(q);
-               blk_put_queue(q);
-               plugged_queue = NULL;
-       }
-}
+static pending_req_t *pending_reqs;
+static struct list_head pending_free;
+static spinlock_t pending_free_lock = SPIN_LOCK_UNLOCKED;
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+static unsigned long mmap_vstart;
+static void **pending_vaddrs;
+static grant_handle_t *pending_grant_handles;
+
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+       return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+       return (unsigned long)pending_vaddrs[vaddr_pagenr(req, seg)];
+}
+
+#define pending_handle(_req, _seg) \
+       (pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
 
 /* When using grant tables to map a frame for device access then the
  * handle returned must be used to unmap the frame. This is needed to
  * drop the ref count on the frame.
  */
-static grant_handle_t pending_grant_handles[MMAP_PAGES];
-#define pending_handle(_idx, _i) \
-    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#define BLKBACK_INVALID_HANDLE (~0)
 
 #ifdef CONFIG_XEN_BLKDEV_TAP_BE
 /*
@@ -105,26 +102,79 @@
 static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
 #endif
 
-static int do_block_io_op(blkif_t *blkif, int max_to_do);
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+                                blkif_request_t *req,
+                                pending_req_t *pending_req);
 static void make_response(blkif_t *blkif, unsigned long id, 
                           unsigned short op, int st);
 
-static void fast_flush_area(int idx, int nr_pages)
+/******************************************************************
+ * misc small helpers
+ */
+static pending_req_t* alloc_req(void)
+{
+       pending_req_t *req = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pending_free_lock, flags);
+       if (!list_empty(&pending_free)) {
+               req = list_entry(pending_free.next, pending_req_t, free_list);
+               list_del(&req->free_list);
+       }
+       spin_unlock_irqrestore(&pending_free_lock, flags);
+       return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+       unsigned long flags;
+       int was_empty;
+
+       spin_lock_irqsave(&pending_free_lock, flags);
+       was_empty = list_empty(&pending_free);
+       list_add(&req->free_list, &pending_free);
+       spin_unlock_irqrestore(&pending_free_lock, flags);
+       if (was_empty)
+               wake_up(&pending_free_wq);
+}
+
+static void unplug_queue(blkif_t *blkif)
+{
+       if (NULL == blkif->plug)
+               return;
+       if (blkif->plug->unplug_fn)
+               blkif->plug->unplug_fn(blkif->plug);
+       blk_put_queue(blkif->plug);
+       blkif->plug = NULL;
+}
+
+static void plug_queue(blkif_t *blkif, struct bio *bio)
+{
+       request_queue_t *q = bdev_get_queue(bio->bi_bdev);
+
+       if (q == blkif->plug)
+               return;
+       unplug_queue(blkif);
+       blk_get_queue(q);
+       blkif->plug = q;
+}
+
+static void fast_flush_area(pending_req_t *req)
 {
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int i, invcount = 0;
        grant_handle_t handle;
        int ret;
 
-       for (i = 0; i < nr_pages; i++) {
-               handle = pending_handle(idx, i);
+       for (i = 0; i < req->nr_pages; i++) {
+               handle = pending_handle(req, i);
                if (handle == BLKBACK_INVALID_HANDLE)
                        continue;
-               unmap[invcount].host_addr    = MMAP_VADDR(idx, i);
+               unmap[invcount].host_addr    = vaddr(req, i);
                unmap[invcount].dev_bus_addr = 0;
                unmap[invcount].handle       = handle;
-               pending_handle(idx, i) = BLKBACK_INVALID_HANDLE;
+               pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
                invcount++;
        }
 
@@ -133,109 +183,79 @@
        BUG_ON(ret);
 }
 
-
-/******************************************************************
- * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
- */
-
-static struct list_head blkio_schedule_list;
-static spinlock_t blkio_schedule_list_lock;
-
-static int __on_blkdev_list(blkif_t *blkif)
-{
-       return blkif->blkdev_list.next != NULL;
-}
-
-static void remove_from_blkdev_list(blkif_t *blkif)
-{
-       unsigned long flags;
-
-       if (!__on_blkdev_list(blkif))
-               return;
-
-       spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-       if (__on_blkdev_list(blkif)) {
-               list_del(&blkif->blkdev_list);
-               blkif->blkdev_list.next = NULL;
-               blkif_put(blkif);
-       }
-       spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-static void add_to_blkdev_list_tail(blkif_t *blkif)
-{
-       unsigned long flags;
-
-       if (__on_blkdev_list(blkif))
-               return;
-
-       spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-       if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) {
-               list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
-               blkif_get(blkif);
-       }
-       spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-
 /******************************************************************
  * SCHEDULER FUNCTIONS
  */
 
-static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
-
-static int blkio_schedule(void *arg)
-{
-       DECLARE_WAITQUEUE(wq, current);
-
-       blkif_t          *blkif;
-       struct list_head *ent;
-
-       daemonize("xenblkd");
-
+static void print_stats(blkif_t *blkif)
+{
+       printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
+              current->comm, blkif->st_oo_req,
+              blkif->st_rd_req, blkif->st_wr_req);
+       blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+       blkif->st_rd_req = 0;
+       blkif->st_wr_req = 0;
+       blkif->st_oo_req = 0;
+}
+
+int blkif_schedule(void *arg)
+{
+       blkif_t          *blkif = arg;
+
+       blkif_get(blkif);
+       if (debug_lvl)
+               printk(KERN_DEBUG "%s: started\n", current->comm);
        for (;;) {
-               /* Wait for work to do. */
-               add_wait_queue(&blkio_schedule_wait, &wq);
-               set_current_state(TASK_INTERRUPTIBLE);
-               if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || 
-                    list_empty(&blkio_schedule_list) )
-                       schedule();
-               __set_current_state(TASK_RUNNING);
-               remove_wait_queue(&blkio_schedule_wait, &wq);
-
-               /* Queue up a batch of requests. */
-               while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
-                      !list_empty(&blkio_schedule_list)) {
-                       ent = blkio_schedule_list.next;
-                       blkif = list_entry(ent, blkif_t, blkdev_list);
-                       blkif_get(blkif);
-                       remove_from_blkdev_list(blkif);
-                       if (do_block_io_op(blkif, BATCH_PER_DOMAIN))
-                               add_to_blkdev_list_tail(blkif);
-                       blkif_put(blkif);
-               }
-
-               /* Push the batch through to disc. */
-               flush_plugged_queue();
-       }
-}
-
-static void maybe_trigger_blkio_schedule(void)
-{
-       /*
-        * Needed so that two processes, which together make the following
-        * predicate true, don't both read stale values and evaluate the
-        * predicate incorrectly. Incredibly unlikely to stall the scheduler
-        * on x86, but...
-        */
-       smp_mb();
-
-       if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
-           !list_empty(&blkio_schedule_list))
-               wake_up(&blkio_schedule_wait);
-}
-
-
+               if (kthread_should_stop()) {
+                       /* asked to quit? */
+                       if (!atomic_read(&blkif->io_pending))
+                               break;
+                       if (debug_lvl)
+                               printk(KERN_DEBUG "%s: I/O pending, delaying exit\n",
+                                      current->comm);
+               }
+
+               if (!atomic_read(&blkif->io_pending)) {
+                       /* Wait for work to do. */
+                       wait_event_interruptible(blkif->wq,
+                                                atomic_read(&blkif->io_pending) ||
+                                                kthread_should_stop());
+               } else if (list_empty(&pending_free)) {
+                       /* Wait for pending_req becoming available. */
+                       wait_event_interruptible(pending_free_wq,
+                                                !list_empty(&pending_free));
+               }
+
+               if (blkif->status != CONNECTED) {
+                       /* not connected: sleep until we are (or must stop) */
+                       if (debug_lvl)
+                               printk(KERN_DEBUG "%s: not connected (%d pending)\n",
+                                      current->comm, atomic_read(&blkif->io_pending));
+                       wait_event_interruptible(blkif->wq,
+                                                blkif->status == CONNECTED ||
+                                                kthread_should_stop());
+                       continue;
+               }
+
+               /* Schedule I/O */
+               atomic_set(&blkif->io_pending, 0);
+               if (do_block_io_op(blkif))
+                       atomic_inc(&blkif->io_pending);
+               unplug_queue(blkif);
+
+               if (log_stats && time_after(jiffies, blkif->st_print))
+                       print_stats(blkif);
+       }
+
+       /* bye folks, and thanks for all the fish ;) */
+       if (log_stats)
+               print_stats(blkif);
+       if (debug_lvl)
+               printk(KERN_DEBUG "%s: exiting\n", current->comm);
+       blkif->xenblkd = NULL;
+       blkif_put(blkif);
+       return 0;
+}
 
 /******************************************************************
  * COMPLETION CALLBACK -- Called as bh->b_end_io()
@@ -243,8 +263,6 @@
 
 static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
 {
-       unsigned long flags;
-
        /* An error fails the entire request. */
        if (!uptodate) {
                DPRINTK("Buffer not up-to-date at end of operation\n");
@@ -252,15 +270,11 @@
        }
 
        if (atomic_dec_and_test(&pending_req->pendcnt)) {
-               int pending_idx = pending_req - pending_reqs;
-               fast_flush_area(pending_idx, pending_req->nr_pages);
+               fast_flush_area(pending_req);
                make_response(pending_req->blkif, pending_req->id,
                              pending_req->operation, pending_req->status);
                blkif_put(pending_req->blkif);
-               spin_lock_irqsave(&pend_prod_lock, flags);
-               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
-               spin_unlock_irqrestore(&pend_prod_lock, flags);
-               maybe_trigger_blkio_schedule();
+               free_req(pending_req);
        }
 }
 
@@ -281,8 +295,9 @@
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
 {
        blkif_t *blkif = dev_id;
-       add_to_blkdev_list_tail(blkif);
-       maybe_trigger_blkio_schedule();
+
+       atomic_inc(&blkif->io_pending);
+       wake_up(&blkif->wq);
        return IRQ_HANDLED;
 }
 
@@ -292,10 +307,11 @@
  * DOWNWARD CALLS -- These interface with the block-device layer proper.
  */
 
-static int do_block_io_op(blkif_t *blkif, int max_to_do)
+static int do_block_io_op(blkif_t *blkif)
 {
        blkif_back_ring_t *blk_ring = &blkif->blk_ring;
        blkif_request_t *req;
+       pending_req_t *pending_req;
        RING_IDX rc, rp;
        int more_to_do = 0;
 
@@ -304,8 +320,10 @@
        rmb(); /* Ensure we see queued requests up to 'rp'. */
 
        while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
-               if ((max_to_do-- == 0) ||
-                   (NR_PENDING_REQS == MAX_PENDING_REQS)) {
+
+               pending_req = alloc_req();
+               if (NULL == pending_req) {
+                       blkif->st_oo_req++;
                        more_to_do = 1;
                        break;
                }
@@ -315,28 +333,31 @@
 
                switch (req->operation) {
                case BLKIF_OP_READ:
+                       blkif->st_rd_req++;
+                       dispatch_rw_block_io(blkif, req, pending_req);
+                       break;
                case BLKIF_OP_WRITE:
-                       dispatch_rw_block_io(blkif, req);
+                       blkif->st_wr_req++;
+                       dispatch_rw_block_io(blkif, req, pending_req);
                        break;
-
                default:
                        DPRINTK("error: unknown block io operation [%d]\n",
                                req->operation);
                        make_response(blkif, req->id, req->operation,
                                      BLKIF_RSP_ERROR);
+                       free_req(pending_req);
                        break;
                }
        }
-
        return more_to_do;
 }
 
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
+static void dispatch_rw_block_io(blkif_t *blkif,
+                                blkif_request_t *req,
+                                pending_req_t *pending_req)
 {
        extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
        int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
-       int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
-       pending_req_t *pending_req;
        struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct phys_req preq;
        struct { 
@@ -344,32 +365,36 @@
        } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int nseg;
        struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-       int nbio = 0;
-       request_queue_t *q;
-       int ret, errors = 0;
+       int ret, i, nbio = 0;
 
        /* Check that number of segments is sane. */
        nseg = req->nr_segments;
        if (unlikely(nseg == 0) || 
            unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
                DPRINTK("Bad number of segments in request (%d)\n", nseg);
-               goto bad_descriptor;
+               goto fail_response;
        }
 
        preq.dev           = req->handle;
        preq.sector_number = req->sector_number;
        preq.nr_sects      = 0;
 
+       pending_req->blkif     = blkif;
+       pending_req->id        = req->id;
+       pending_req->operation = operation;
+       pending_req->status    = BLKIF_RSP_OKAY;
+       pending_req->nr_pages  = nseg;
+
        for (i = 0; i < nseg; i++) {
                seg[i].nsec = req->seg[i].last_sect -
                        req->seg[i].first_sect + 1;
 
                if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
                    (seg[i].nsec <= 0))
-                       goto bad_descriptor;
+                       goto fail_response;
                preq.nr_sects += seg[i].nsec;
 
-               map[i].host_addr = MMAP_VADDR(pending_idx, i);
+               map[i].host_addr = vaddr(pending_req, i);
                map[i].dom = blkif->domid;
                map[i].ref = req->seg[i].gref;
                map[i].flags = GNTMAP_host_map;
@@ -381,26 +406,22 @@
        BUG_ON(ret);
 
        for (i = 0; i < nseg; i++) {
-               if (likely(map[i].status == 0)) {
-                       pending_handle(pending_idx, i) = map[i].handle;
+               if (unlikely(map[i].status != 0)) {
+                       DPRINTK("invalid buffer -- could not remap it\n");
+                       goto fail_flush;
+               }
+
+               pending_handle(pending_req, i) = map[i].handle;
 #ifdef __ia64__
-                       MMAP_VADDR(pending_idx,i) = gnttab_map_vaddr(map[i]);
+               pending_vaddrs[vaddr_pagenr(req, seg)] =
+                       = gnttab_map_vaddr(map[i]);
 #else
-                       set_phys_to_machine(__pa(MMAP_VADDR(
-                               pending_idx, i)) >> PAGE_SHIFT,
-                               FOREIGN_FRAME(map[i].dev_bus_addr>>PAGE_SHIFT));
+               set_phys_to_machine(__pa(vaddr(
+                       pending_req, i)) >> PAGE_SHIFT,
+                       FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
 #endif
-                       seg[i].buf = map[i].dev_bus_addr |
-                               (req->seg[i].first_sect << 9);
-               } else {
-                       errors++;
-               }
-       }
-
-       if (errors) {
-               DPRINTK("invalid buffer -- could not remap it\n");
-               fast_flush_area(pending_idx, nseg);
-               goto bad_descriptor;
+               seg[i].buf  = map[i].dev_bus_addr | 
+                       (req->seg[i].first_sect << 9);
        }
 
        if (vbd_translate(&preq, blkif, operation) != 0) {
@@ -408,37 +429,25 @@
                        operation == READ ? "read" : "write",
                        preq.sector_number,
                        preq.sector_number + preq.nr_sects, preq.dev); 
-               goto bad_descriptor;
-       }
-
-       pending_req = &pending_reqs[pending_idx];
-       pending_req->blkif     = blkif;
-       pending_req->id        = req->id;
-       pending_req->operation = operation;
-       pending_req->status    = BLKIF_RSP_OKAY;
-       pending_req->nr_pages  = nseg;
+               goto fail_flush;
+       }
 
        for (i = 0; i < nseg; i++) {
                if (((int)preq.sector_number|(int)seg[i].nsec) &
                    ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
                        DPRINTK("Misaligned I/O request from domain %d",
                                blkif->domid);
-                       goto cleanup_and_fail;
+                       goto fail_put_bio;
                }
 
                while ((bio == NULL) ||
                       (bio_add_page(bio,
-                                    virt_to_page(MMAP_VADDR(pending_idx, i)),
+                                    virt_to_page(vaddr(pending_req, i)),
                                     seg[i].nsec << 9,
                                     seg[i].buf & ~PAGE_MASK) == 0)) {
                        bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
-                       if (unlikely(bio == NULL)) {
-                       cleanup_and_fail:
-                               for (i = 0; i < (nbio-1); i++)
-                                       bio_put(biolist[i]);
-                               fast_flush_area(pending_idx, nseg);
-                               goto bad_descriptor;
-                       }
+                       if (unlikely(bio == NULL))
+                               goto fail_put_bio;
                 
                        bio->bi_bdev    = preq.bdev;
                        bio->bi_private = pending_req;
@@ -449,14 +458,8 @@
                preq.sector_number += seg[i].nsec;
        }
 
-       if ((q = bdev_get_queue(bio->bi_bdev)) != plugged_queue) {
-               flush_plugged_queue();
-               blk_get_queue(q);
-               plugged_queue = q;
-       }
-
+       plug_queue(blkif, bio);
        atomic_set(&pending_req->pendcnt, nbio);
-       pending_cons++;
        blkif_get(blkif);
 
        for (i = 0; i < nbio; i++)
@@ -464,8 +467,14 @@
 
        return;
 
- bad_descriptor:
+ fail_put_bio:
+       for (i = 0; i < (nbio-1); i++)
+               bio_put(biolist[i]);
+ fail_flush:
+       fast_flush_area(pending_req);
+ fail_response:
        make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+       free_req(pending_req);
 } 
 
 
@@ -481,6 +490,7 @@
        blkif_response_t *resp;
        unsigned long     flags;
        blkif_back_ring_t *blk_ring = &blkif->blk_ring;
+       int more_to_do = 0;
        int notify;
 
        spin_lock_irqsave(&blkif->blk_ring_lock, flags);
@@ -499,76 +509,67 @@
                 * notifications if requests are already in flight (lower
                 * overheads and promotes batching).
                 */
-               int more_to_do;
                RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
-               if (more_to_do) {
-                       add_to_blkdev_list_tail(blkif);
-                       maybe_trigger_blkio_schedule();
-               }
-       }
-       else if (!__on_blkdev_list(blkif)
-                && RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
-               /* Keep pulling requests as they become available... */
-               add_to_blkdev_list_tail(blkif);
-               maybe_trigger_blkio_schedule();
-       }
-
+
+       } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
+               more_to_do = 1;
+
+       }
        spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
 
+       if (more_to_do) {
+               atomic_inc(&blkif->io_pending);
+               wake_up(&blkif->wq);
+       }
        if (notify)
                notify_remote_via_irq(blkif->irq);
 }
 
-void blkif_deschedule(blkif_t *blkif)
-{
-       remove_from_blkdev_list(blkif);
-}
-
 static int __init blkif_init(void)
 {
+       struct page *page;
        int i;
-       struct page *page;
-       int ret;
-
-       for (i = 0; i < MMAP_PAGES; i++)
-               pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
 
        if (xen_init() < 0)
                return -ENODEV;
 
+       mmap_pages            = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
+                                       blkif_reqs, GFP_KERNEL);
+       pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+                                       mmap_pages, GFP_KERNEL);
+       pending_vaddrs        = kmalloc(sizeof(pending_vaddrs[0]) *
+                                       mmap_pages, GFP_KERNEL);
+       if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) {
+               printk("%s: out of memory\n", __FUNCTION__);
+               return -ENOMEM;
+       }
+
        blkif_interface_init();
-
+       
 #ifdef __ia64__
-    {
        extern unsigned long alloc_empty_foreign_map_page_range(unsigned long pages);
-       int i;
-
-       mmap_vstart =  alloc_empty_foreign_map_page_range(MMAP_PAGES);
-       printk("Allocated mmap_vstart: 0x%lx\n", mmap_vstart);
-       for(i = 0; i < MMAP_PAGES; i++)
-           pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
-       BUG_ON(mmap_vstart == NULL);
-    }
-#else
-       page = balloon_alloc_empty_page_range(MMAP_PAGES);
+       mmap_vstart = (unsigned long)alloc_empty_foreign_map_page_range(mmap_pages);
+#else /* ! ia64 */
+       page = balloon_alloc_empty_page_range(mmap_pages);
        BUG_ON(page == NULL);
        mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
 #endif
-
-       pending_cons = 0;
-       pending_prod = MAX_PENDING_REQS;
+       printk("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
+              __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart);
+       BUG_ON(mmap_vstart == 0);
+       for (i = 0; i < mmap_pages; i++) {
+               pending_vaddrs[i] = (void *)(mmap_vstart + (i << PAGE_SHIFT));
+               pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
+       }
+
-       memset(pending_reqs, 0, sizeof(pending_reqs));
-       for (i = 0; i < MAX_PENDING_REQS; i++)
-               pending_ring[i] = i;
+       memset(pending_reqs, 0, sizeof(pending_reqs[0]) * blkif_reqs);
+       INIT_LIST_HEAD(&pending_free);
+
+       for (i = 0; i < blkif_reqs; i++)
+               list_add_tail(&pending_reqs[i].free_list, &pending_free);
     
-       spin_lock_init(&blkio_schedule_list_lock);
-       INIT_LIST_HEAD(&blkio_schedule_list);
-
-       ret = kernel_thread(blkio_schedule, NULL, CLONE_FS | CLONE_FILES);
-       BUG_ON(ret < 0);
-
        blkif_xenbus_init();
-
        return 0;
 }
 
diff -r 0255f48b757f linux-2.6-xen-sparse/drivers/xen/blkback/common.h
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Sun Dec  4 19:12:00 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Tue Dec  6 15:29:06 2005
@@ -60,9 +60,19 @@
        /* Is this a blktap frontend */
        unsigned int     is_blktap;
 #endif
-       struct list_head blkdev_list;
        spinlock_t       blk_ring_lock;
        atomic_t         refcnt;
+
+       wait_queue_head_t   wq;
+       struct task_struct  *xenblkd;
+       atomic_t            io_pending;
+       request_queue_t     *plug;
+
+       /* statistics */
+       unsigned long       st_print;
+       int                 st_rd_req;
+       int                 st_wr_req;
+       int                 st_oo_req;
 
        struct work_struct free_work;
 
@@ -101,11 +111,10 @@
 
 void blkif_interface_init(void);
 
-void blkif_deschedule(blkif_t *blkif);
-
 void blkif_xenbus_init(void);
 
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+int blkif_schedule(void *arg);
 
 void update_blkif_status(blkif_t *blkif); 
 
diff -r 0255f48b757f linux-2.6-xen-sparse/drivers/xen/blkback/interface.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c      Sun Dec  4 19:12:00 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c      Tue Dec  6 15:29:06 2005
@@ -24,6 +24,8 @@
        blkif->status = DISCONNECTED;
        spin_lock_init(&blkif->blk_ring_lock);
        atomic_set(&blkif->refcnt, 1);
+       init_waitqueue_head(&blkif->wq);
+       blkif->st_print = jiffies;
 
        return blkif;
 }
@@ -111,6 +113,7 @@
 
        blkif->irq = bind_evtchn_to_irqhandler(
                blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
+       wake_up(&blkif->wq);
 
        /* We're potentially connected now */
        update_blkif_status(blkif); 
diff -r 0255f48b757f linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Sun Dec  4 19:12:00 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Tue Dec  6 15:29:06 2005
@@ -20,6 +20,7 @@
 
 #include <stdarg.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 #include <asm-xen/xenbus.h>
 #include "common.h"
 
@@ -92,6 +93,8 @@
        }
        if (be->blkif) {
                be->blkif->status = DISCONNECTED; 
+               if (be->blkif->xenblkd)
+                       kthread_stop(be->blkif->xenblkd);
                blkif_put(be->blkif);
                be->blkif = NULL;
        }
@@ -217,6 +220,17 @@
                        be->major = 0;
                        be->minor = 0;
                        xenbus_dev_fatal(dev, err, "creating vbd structure");
+                       return;
+               }
+
+               be->blkif->xenblkd = kthread_run(blkif_schedule, be->blkif,
+                                                "xvd %d %02x:%02x",
+                                                be->blkif->domid,
+                                                be->major, be->minor);
+               if (IS_ERR(be->blkif->xenblkd)) {
+                       err = PTR_ERR(be->blkif->xenblkd);
+                       be->blkif->xenblkd = NULL;
+                       xenbus_dev_error(dev, err, "start xenblkd");
                        return;
                }

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

WARNING - OLD ARCHIVES

xen-devel

[Xen-devel] Re: [patch] CFQ for xen domains