[Xen-devel] Re: [patch] CFQ for xen domains

Gerd Knorr wrote:
> Hi folks,
>
> New version of the patch, adapted to apply cleanly against latest

One more version, this time against 3.0-final ;)
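
The interesting structural change below is that the single global xenblkd daemon goes away; each block backend now runs its own kernel thread (see blkif_schedule() and the kthread_run() call in the xenbus.c hunk), so the disk scheduler sees one submitting task per domain. A rough, self-contained sketch of that thread lifecycle, with made-up names (my_backend, my_thread and friends are illustrative only, not from the patch):

#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <asm/atomic.h>

/* Hypothetical per-backend state, loosely modelled on blkif_t. */
struct my_backend {
	wait_queue_head_t  wq;
	atomic_t           io_pending;
	struct task_struct *thread;
};

/* One of these runs per backend, so each domain's I/O has its own context. */
static int my_thread(void *arg)
{
	struct my_backend *be = arg;

	while (!kthread_should_stop()) {
		/* Sleep until the interrupt handler signals new requests. */
		wait_event_interruptible(be->wq,
					 atomic_read(&be->io_pending) ||
					 kthread_should_stop());

		/* Consume and submit the pending requests here. */
		atomic_set(&be->io_pending, 0);
	}
	return 0;
}

static int my_backend_start(struct my_backend *be)
{
	init_waitqueue_head(&be->wq);
	atomic_set(&be->io_pending, 0);

	be->thread = kthread_run(my_thread, be, "my-backend");
	if (IS_ERR(be->thread)) {
		int err = PTR_ERR(be->thread);
		be->thread = NULL;
		return err;
	}
	return 0;
}

static void my_backend_stop(struct my_backend *be)
{
	if (be->thread)
		kthread_stop(be->thread);  /* sets the stop flag, wakes the task, waits */
	be->thread = NULL;
}

kthread_stop() both raises the stop flag and wakes the task, so the wait_event_interruptible() loop falls through cleanly; the real blkif_schedule() additionally delays exit while I/O is still pending.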

cheers,

  Gerd

diff -r 0255f48b757f linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c        Sun Dec  4 19:12:00 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c        Tue Dec  6 15:29:06 2005
@@ -12,6 +12,8 @@
  */
 
 #include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
 #include <asm-xen/balloon.h>
 #include <asm/hypervisor.h>
 #include "common.h"
@@ -21,26 +23,26 @@
  * pulled from a communication ring are quite likely to end up being part of
  * the same scatter/gather request at the disc.
  * 
- * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
+ * 
  * This will increase the chances of being able to write whole tracks.
  * 64 should be enough to keep us competitive with Linux.
  */
-#define MAX_PENDING_REQS 64
-#define BATCH_PER_DOMAIN 16
-
-static unsigned long mmap_vstart;
-#define MMAP_PAGES                                             \
-       (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
-#ifdef __ia64__
-static void *pending_vaddrs[MMAP_PAGES];
-#define MMAP_VADDR(_idx, _i) \
-       (unsigned long)(pending_vaddrs[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#else
-#define MMAP_VADDR(_req,_seg)                                          \
-       (mmap_vstart +                                                  \
-        ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
-        ((_seg) * PAGE_SIZE))
-#endif
+static int blkif_reqs = 64;
+static int mmap_pages;
+
+static int __init set_blkif_reqs(char *str)
+{
+       get_option(&str, &blkif_reqs);
+       return 1;
+}
+__setup("blkif_reqs=", set_blkif_reqs);
+
+/* runtime-switchable, check /sys/module/blkback/parameters/ ;) */
+static int log_stats = 0;
+static int debug_lvl = 0;
+module_param(log_stats, int, 0644);
+module_param(debug_lvl, int, 0644);
 
 /*
  * Each outstanding request that we've passed to the lower device layers has a 
@@ -55,43 +57,38 @@
        atomic_t       pendcnt;
        unsigned short operation;
        int            status;
+       struct list_head free_list;
 } pending_req_t;
 
-/*
- * We can't allocate pending_req's in order, since they may complete out of 
- * order. We therefore maintain an allocation ring. This ring also indicates 
- * when enough work has been passed down -- at that point the allocation ring 
- * will be empty.
- */
-static pending_req_t pending_reqs[MAX_PENDING_REQS];
-static unsigned char pending_ring[MAX_PENDING_REQS];
-static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
-/* NB. We use a different index type to differentiate from shared blk rings. */
-typedef unsigned int PEND_RING_IDX;
-#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
-static PEND_RING_IDX pending_prod, pending_cons;
-#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
-
-static request_queue_t *plugged_queue;
-static inline void flush_plugged_queue(void)
-{
-       request_queue_t *q = plugged_queue;
-       if (q != NULL) {
-               if ( q->unplug_fn != NULL )
-                       q->unplug_fn(q);
-               blk_put_queue(q);
-               plugged_queue = NULL;
-       }
-}
+static pending_req_t *pending_reqs;
+static struct list_head pending_free;
+static spinlock_t pending_free_lock = SPIN_LOCK_UNLOCKED;
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+static unsigned long mmap_vstart;
+static unsigned long *pending_vaddrs;
+static grant_handle_t *pending_grant_handles;
+
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+       return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+       return pending_vaddrs[vaddr_pagenr(req, seg)];
+}
+
+#define pending_handle(_req, _seg) \
+       (pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
 
 /* When using grant tables to map a frame for device access then the
  * handle returned must be used to unmap the frame. This is needed to
  * drop the ref count on the frame.
  */
-static grant_handle_t pending_grant_handles[MMAP_PAGES];
-#define pending_handle(_idx, _i) \
-    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#define BLKBACK_INVALID_HANDLE (~0)
 
 #ifdef CONFIG_XEN_BLKDEV_TAP_BE
 /*
@@ -105,26 +102,79 @@
 static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
 #endif
 
-static int do_block_io_op(blkif_t *blkif, int max_to_do);
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+                                blkif_request_t *req,
+                                pending_req_t *pending_req);
 static void make_response(blkif_t *blkif, unsigned long id, 
                           unsigned short op, int st);
 
-static void fast_flush_area(int idx, int nr_pages)
+/******************************************************************
+ * misc small helpers
+ */
+static pending_req_t* alloc_req(void)
+{
+       pending_req_t *req = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pending_free_lock, flags);
+       if (!list_empty(&pending_free)) {
+               req = list_entry(pending_free.next, pending_req_t, free_list);
+               list_del(&req->free_list);
+       }
+       spin_unlock_irqrestore(&pending_free_lock, flags);
+       return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+       unsigned long flags;
+       int was_empty;
+
+       spin_lock_irqsave(&pending_free_lock, flags);
+       was_empty = list_empty(&pending_free);
+       list_add(&req->free_list, &pending_free);
+       spin_unlock_irqrestore(&pending_free_lock, flags);
+       if (was_empty)
+               wake_up(&pending_free_wq);
+}
+
+static void unplug_queue(blkif_t *blkif)
+{
+       if (NULL == blkif->plug)
+               return;
+       if (blkif->plug->unplug_fn)
+               blkif->plug->unplug_fn(blkif->plug);
+       blk_put_queue(blkif->plug);
+       blkif->plug = NULL;
+}
+
+static void plug_queue(blkif_t *blkif, struct bio *bio)
+{
+       request_queue_t *q = bdev_get_queue(bio->bi_bdev);
+
+       if (q == blkif->plug)
+               return;
+       unplug_queue(blkif);
+       blk_get_queue(q);
+       blkif->plug = q;
+}
+
+static void fast_flush_area(pending_req_t *req)
 {
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int i, invcount = 0;
        grant_handle_t handle;
        int ret;
 
-       for (i = 0; i < nr_pages; i++) {
-               handle = pending_handle(idx, i);
+       for (i = 0; i < req->nr_pages; i++) {
+               handle = pending_handle(req, i);
                if (handle == BLKBACK_INVALID_HANDLE)
                        continue;
-               unmap[invcount].host_addr    = MMAP_VADDR(idx, i);
+               unmap[invcount].host_addr    = vaddr(req, i);
                unmap[invcount].dev_bus_addr = 0;
                unmap[invcount].handle       = handle;
-               pending_handle(idx, i) = BLKBACK_INVALID_HANDLE;
+               pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
                invcount++;
        }
 
@@ -133,109 +183,79 @@
        BUG_ON(ret);
 }
 
-
-/******************************************************************
- * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
- */
-
-static struct list_head blkio_schedule_list;
-static spinlock_t blkio_schedule_list_lock;
-
-static int __on_blkdev_list(blkif_t *blkif)
-{
-       return blkif->blkdev_list.next != NULL;
-}
-
-static void remove_from_blkdev_list(blkif_t *blkif)
-{
-       unsigned long flags;
-
-       if (!__on_blkdev_list(blkif))
-               return;
-
-       spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-       if (__on_blkdev_list(blkif)) {
-               list_del(&blkif->blkdev_list);
-               blkif->blkdev_list.next = NULL;
-               blkif_put(blkif);
-       }
-       spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-static void add_to_blkdev_list_tail(blkif_t *blkif)
-{
-       unsigned long flags;
-
-       if (__on_blkdev_list(blkif))
-               return;
-
-       spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-       if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) {
-               list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
-               blkif_get(blkif);
-       }
-       spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-
 /******************************************************************
  * SCHEDULER FUNCTIONS
  */
 
-static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
-
-static int blkio_schedule(void *arg)
-{
-       DECLARE_WAITQUEUE(wq, current);
-
-       blkif_t          *blkif;
-       struct list_head *ent;
-
-       daemonize("xenblkd");
-
+static void print_stats(blkif_t *blkif)
+{
+       printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
+              current->comm, blkif->st_oo_req,
+              blkif->st_rd_req, blkif->st_wr_req);
+       blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+       blkif->st_rd_req = 0;
+       blkif->st_wr_req = 0;
+       blkif->st_oo_req = 0;
+}
+
+int blkif_schedule(void *arg)
+{
+       blkif_t          *blkif = arg;
+
+       blkif_get(blkif);
+       if (debug_lvl)
+               printk(KERN_DEBUG "%s: started\n", current->comm);
        for (;;) {
-               /* Wait for work to do. */
-               add_wait_queue(&blkio_schedule_wait, &wq);
-               set_current_state(TASK_INTERRUPTIBLE);
-               if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || 
-                    list_empty(&blkio_schedule_list) )
-                       schedule();
-               __set_current_state(TASK_RUNNING);
-               remove_wait_queue(&blkio_schedule_wait, &wq);
-
-               /* Queue up a batch of requests. */
-               while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
-                      !list_empty(&blkio_schedule_list)) {
-                       ent = blkio_schedule_list.next;
-                       blkif = list_entry(ent, blkif_t, blkdev_list);
-                       blkif_get(blkif);
-                       remove_from_blkdev_list(blkif);
-                       if (do_block_io_op(blkif, BATCH_PER_DOMAIN))
-                               add_to_blkdev_list_tail(blkif);
-                       blkif_put(blkif);
-               }
-
-               /* Push the batch through to disc. */
-               flush_plugged_queue();
-       }
-}
-
-static void maybe_trigger_blkio_schedule(void)
-{
-       /*
-        * Needed so that two processes, which together make the following
-        * predicate true, don't both read stale values and evaluate the
-        * predicate incorrectly. Incredibly unlikely to stall the scheduler
-        * on x86, but...
-        */
-       smp_mb();
-
-       if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
-           !list_empty(&blkio_schedule_list))
-               wake_up(&blkio_schedule_wait);
-}
-
-
+               if (kthread_should_stop()) {
+                       /* asked to quit? */
+                       if (!atomic_read(&blkif->io_pending))
+                               break;
+                       if (debug_lvl)
+                               printk(KERN_DEBUG "%s: I/O pending, delaying exit\n",
+                                      current->comm);
+               }
+
+               if (!atomic_read(&blkif->io_pending)) {
+                       /* Wait for work to do. */
+                       wait_event_interruptible(blkif->wq,
+                                                atomic_read(&blkif->io_pending) ||
+                                                kthread_should_stop());
+               } else if (list_empty(&pending_free)) {
+                       /* Wait for pending_req becoming available. */
+                       wait_event_interruptible(pending_free_wq,
+                                                !list_empty(&pending_free));
+               }
+
+               if (blkif->status != CONNECTED) {
+                       /* make sure we are connected */
+                       if (debug_lvl)
+                               printk(KERN_DEBUG "%s: not connected (%d pending)\n",
+                                      current->comm, atomic_read(&blkif->io_pending));
+                       wait_event_interruptible(blkif->wq,
+                                                blkif->status != CONNECTED ||
+                                                kthread_should_stop());
+                       continue;
+               }
+
+               /* Schedule I/O */
+               atomic_set(&blkif->io_pending, 0);
+               if (do_block_io_op(blkif))
+                       atomic_inc(&blkif->io_pending);
+               unplug_queue(blkif);
+
+               if (log_stats && time_after(jiffies, blkif->st_print))
+                       print_stats(blkif);
+       }
+
+       /* bye folks, and thanks for all the fish ;) */
+       if (log_stats)
+               print_stats(blkif);
+       if (debug_lvl)
+               printk(KERN_DEBUG "%s: exiting\n", current->comm);
+       blkif->xenblkd = NULL;
+       blkif_put(blkif);
+       return 0;
+}
 
 /******************************************************************
  * COMPLETION CALLBACK -- Called as bh->b_end_io()
@@ -243,8 +263,6 @@
 
 static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
 {
-       unsigned long flags;
-
        /* An error fails the entire request. */
        if (!uptodate) {
                DPRINTK("Buffer not up-to-date at end of operation\n");
@@ -252,15 +270,11 @@
        }
 
        if (atomic_dec_and_test(&pending_req->pendcnt)) {
-               int pending_idx = pending_req - pending_reqs;
-               fast_flush_area(pending_idx, pending_req->nr_pages);
+               fast_flush_area(pending_req);
                make_response(pending_req->blkif, pending_req->id,
                              pending_req->operation, pending_req->status);
                blkif_put(pending_req->blkif);
-               spin_lock_irqsave(&pend_prod_lock, flags);
-               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
-               spin_unlock_irqrestore(&pend_prod_lock, flags);
-               maybe_trigger_blkio_schedule();
+               free_req(pending_req);
        }
 }
 
@@ -281,8 +295,9 @@
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
 {
        blkif_t *blkif = dev_id;
-       add_to_blkdev_list_tail(blkif);
-       maybe_trigger_blkio_schedule();
+
+       atomic_inc(&blkif->io_pending);
+       wake_up(&blkif->wq);
        return IRQ_HANDLED;
 }
 
@@ -292,10 +307,11 @@
  * DOWNWARD CALLS -- These interface with the block-device layer proper.
  */
 
-static int do_block_io_op(blkif_t *blkif, int max_to_do)
+static int do_block_io_op(blkif_t *blkif)
 {
        blkif_back_ring_t *blk_ring = &blkif->blk_ring;
        blkif_request_t *req;
+       pending_req_t *pending_req;
        RING_IDX rc, rp;
        int more_to_do = 0;
 
@@ -304,8 +320,10 @@
        rmb(); /* Ensure we see queued requests up to 'rp'. */
 
        while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
-               if ((max_to_do-- == 0) ||
-                   (NR_PENDING_REQS == MAX_PENDING_REQS)) {
+
+               pending_req = alloc_req();
+               if (NULL == pending_req) {
+                       blkif->st_oo_req++;
                        more_to_do = 1;
                        break;
                }
@@ -315,28 +333,31 @@
 
                switch (req->operation) {
                case BLKIF_OP_READ:
+                       blkif->st_rd_req++;
+                       dispatch_rw_block_io(blkif, req, pending_req);
+                       break;
                case BLKIF_OP_WRITE:
-                       dispatch_rw_block_io(blkif, req);
+                       blkif->st_wr_req++;
+                       dispatch_rw_block_io(blkif, req, pending_req);
                        break;
-
                default:
                        DPRINTK("error: unknown block io operation [%d]\n",
                                req->operation);
                        make_response(blkif, req->id, req->operation,
                                      BLKIF_RSP_ERROR);
+                       free_req(pending_req);
                        break;
                }
        }
-
        return more_to_do;
 }
 
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
+static void dispatch_rw_block_io(blkif_t *blkif,
+                                blkif_request_t *req,
+                                pending_req_t *pending_req)
 {
        extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
        int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
-       int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
-       pending_req_t *pending_req;
        struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct phys_req preq;
        struct { 
@@ -344,32 +365,36 @@
        } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int nseg;
        struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-       int nbio = 0;
-       request_queue_t *q;
-       int ret, errors = 0;
+       int ret, i, nbio = 0;
 
        /* Check that number of segments is sane. */
        nseg = req->nr_segments;
        if (unlikely(nseg == 0) || 
            unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
                DPRINTK("Bad number of segments in request (%d)\n", nseg);
-               goto bad_descriptor;
+               goto fail_response;
        }
 
        preq.dev           = req->handle;
        preq.sector_number = req->sector_number;
        preq.nr_sects      = 0;
 
+       pending_req->blkif     = blkif;
+       pending_req->id        = req->id;
+       pending_req->operation = operation;
+       pending_req->status    = BLKIF_RSP_OKAY;
+       pending_req->nr_pages  = nseg;
+
        for (i = 0; i < nseg; i++) {
                seg[i].nsec = req->seg[i].last_sect -
                        req->seg[i].first_sect + 1;
 
                if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
                    (seg[i].nsec <= 0))
-                       goto bad_descriptor;
+                       goto fail_response;
                preq.nr_sects += seg[i].nsec;
 
-               map[i].host_addr = MMAP_VADDR(pending_idx, i);
+               map[i].host_addr = vaddr(pending_req, i);
                map[i].dom = blkif->domid;
                map[i].ref = req->seg[i].gref;
                map[i].flags = GNTMAP_host_map;
@@ -381,26 +406,22 @@
        BUG_ON(ret);
 
        for (i = 0; i < nseg; i++) {
-               if (likely(map[i].status == 0)) {
-                       pending_handle(pending_idx, i) = map[i].handle;
+               if (unlikely(map[i].status != 0)) {
+                       DPRINTK("invalid buffer -- could not remap it\n");
+                       goto fail_flush;
+               }
+
+               pending_handle(pending_req, i) = map[i].handle;
 #ifdef __ia64__
-                       MMAP_VADDR(pending_idx,i) = gnttab_map_vaddr(map[i]);
+               pending_vaddrs[vaddr_pagenr(pending_req, i)] =
+                       gnttab_map_vaddr(map[i]);
 #else
-                       set_phys_to_machine(__pa(MMAP_VADDR(
-                               pending_idx, i)) >> PAGE_SHIFT,
-                               FOREIGN_FRAME(map[i].dev_bus_addr>>PAGE_SHIFT));
+               set_phys_to_machine(__pa(vaddr(
+                       pending_req, i)) >> PAGE_SHIFT,
+                       FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
 #endif
-                       seg[i].buf = map[i].dev_bus_addr |
-                               (req->seg[i].first_sect << 9);
-               } else {
-                       errors++;
-               }
-       }
-
-       if (errors) {
-               DPRINTK("invalid buffer -- could not remap it\n");
-               fast_flush_area(pending_idx, nseg);
-               goto bad_descriptor;
+               seg[i].buf  = map[i].dev_bus_addr | 
+                       (req->seg[i].first_sect << 9);
        }
 
        if (vbd_translate(&preq, blkif, operation) != 0) {
@@ -408,37 +429,25 @@
                        operation == READ ? "read" : "write",
                        preq.sector_number,
                        preq.sector_number + preq.nr_sects, preq.dev); 
-               goto bad_descriptor;
-       }
-
-       pending_req = &pending_reqs[pending_idx];
-       pending_req->blkif     = blkif;
-       pending_req->id        = req->id;
-       pending_req->operation = operation;
-       pending_req->status    = BLKIF_RSP_OKAY;
-       pending_req->nr_pages  = nseg;
+               goto fail_flush;
+       }
 
        for (i = 0; i < nseg; i++) {
                if (((int)preq.sector_number|(int)seg[i].nsec) &
                    ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
                        DPRINTK("Misaligned I/O request from domain %d",
                                blkif->domid);
-                       goto cleanup_and_fail;
+                       goto fail_put_bio;
                }
 
                while ((bio == NULL) ||
                       (bio_add_page(bio,
-                                    virt_to_page(MMAP_VADDR(pending_idx, i)),
+                                    virt_to_page(vaddr(pending_req, i)),
                                     seg[i].nsec << 9,
                                     seg[i].buf & ~PAGE_MASK) == 0)) {
                        bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
-                       if (unlikely(bio == NULL)) {
-                       cleanup_and_fail:
-                               for (i = 0; i < (nbio-1); i++)
-                                       bio_put(biolist[i]);
-                               fast_flush_area(pending_idx, nseg);
-                               goto bad_descriptor;
-                       }
+                       if (unlikely(bio == NULL))
+                               goto fail_put_bio;
                 
                        bio->bi_bdev    = preq.bdev;
                        bio->bi_private = pending_req;
@@ -449,14 +458,8 @@
                preq.sector_number += seg[i].nsec;
        }
 
-       if ((q = bdev_get_queue(bio->bi_bdev)) != plugged_queue) {
-               flush_plugged_queue();
-               blk_get_queue(q);
-               plugged_queue = q;
-       }
-
+       plug_queue(blkif, bio);
        atomic_set(&pending_req->pendcnt, nbio);
-       pending_cons++;
        blkif_get(blkif);
 
        for (i = 0; i < nbio; i++)
@@ -464,8 +467,14 @@
 
        return;
 
- bad_descriptor:
+ fail_put_bio:
+       for (i = 0; i < (nbio-1); i++)
+               bio_put(biolist[i]);
+ fail_flush:
+       fast_flush_area(pending_req);
+ fail_response:
        make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+       free_req(pending_req);
 } 
 
 
@@ -481,6 +490,7 @@
        blkif_response_t *resp;
        unsigned long     flags;
        blkif_back_ring_t *blk_ring = &blkif->blk_ring;
+       int more_to_do = 0;
        int notify;
 
        spin_lock_irqsave(&blkif->blk_ring_lock, flags);
@@ -499,76 +509,67 @@
                 * notifications if requests are already in flight (lower
                 * overheads and promotes batching).
                 */
-               int more_to_do;
                RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
-               if (more_to_do) {
-                       add_to_blkdev_list_tail(blkif);
-                       maybe_trigger_blkio_schedule();
-               }
-       }
-       else if (!__on_blkdev_list(blkif)
-                && RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
-               /* Keep pulling requests as they become available... */
-               add_to_blkdev_list_tail(blkif);
-               maybe_trigger_blkio_schedule();
-       }
-
+
+       } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
+               more_to_do = 1;
+
+       }
        spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
 
+       if (more_to_do) {
+               atomic_inc(&blkif->io_pending);
+               wake_up(&blkif->wq);
+       }
        if (notify)
                notify_remote_via_irq(blkif->irq);
 }
 
-void blkif_deschedule(blkif_t *blkif)
-{
-       remove_from_blkdev_list(blkif);
-}
-
 static int __init blkif_init(void)
 {
+       struct page *page;
        int i;
-       struct page *page;
-       int ret;
-
-       for (i = 0; i < MMAP_PAGES; i++)
-               pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
 
        if (xen_init() < 0)
                return -ENODEV;
 
+       mmap_pages            = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
+                                       blkif_reqs, GFP_KERNEL);
+       pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+                                       mmap_pages, GFP_KERNEL);
+       pending_vaddrs        = kmalloc(sizeof(pending_vaddrs[0]) *
+                                       mmap_pages, GFP_KERNEL);
+       if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) {
+               printk("%s: out of memory\n", __FUNCTION__);
+               return -1;
+       }
+
        blkif_interface_init();
-
+       
 #ifdef __ia64__
-    {
        extern unsigned long alloc_empty_foreign_map_page_range(unsigned long pages);
-       int i;
-
-       mmap_vstart =  alloc_empty_foreign_map_page_range(MMAP_PAGES);
-       printk("Allocated mmap_vstart: 0x%lx\n", mmap_vstart);
-       for(i = 0; i < MMAP_PAGES; i++)
-           pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
-       BUG_ON(mmap_vstart == NULL);
-    }
-#else
-       page = balloon_alloc_empty_page_range(MMAP_PAGES);
+       mmap_vstart = (unsigned long)alloc_empty_foreign_map_page_range(mmap_pages);
+#else /* ! ia64 */
+       page = balloon_alloc_empty_page_range(mmap_pages);
        BUG_ON(page == NULL);
        mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
 #endif
-
-       pending_cons = 0;
-       pending_prod = MAX_PENDING_REQS;
+       printk("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
+              __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart);
+       BUG_ON(mmap_vstart == 0);
+       for (i = 0; i < mmap_pages; i++) {
+               pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
+               pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
+       }
+
        memset(pending_reqs, 0, sizeof(pending_reqs));
-       for (i = 0; i < MAX_PENDING_REQS; i++)
-               pending_ring[i] = i;
+       INIT_LIST_HEAD(&pending_free);
+
+       for (i = 0; i < blkif_reqs; i++)
+               list_add_tail(&pending_reqs[i].free_list, &pending_free);
     
-       spin_lock_init(&blkio_schedule_list_lock);
-       INIT_LIST_HEAD(&blkio_schedule_list);
-
-       ret = kernel_thread(blkio_schedule, NULL, CLONE_FS | CLONE_FILES);
-       BUG_ON(ret < 0);
-
        blkif_xenbus_init();
-
        return 0;
 }
 
diff -r 0255f48b757f linux-2.6-xen-sparse/drivers/xen/blkback/common.h
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Sun Dec  4 19:12:00 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Tue Dec  6 15:29:06 2005
@@ -60,9 +60,19 @@
        /* Is this a blktap frontend */
        unsigned int     is_blktap;
 #endif
-       struct list_head blkdev_list;
        spinlock_t       blk_ring_lock;
        atomic_t         refcnt;
+
+       wait_queue_head_t   wq;
+       struct task_struct  *xenblkd;
+       atomic_t            io_pending;
+       request_queue_t     *plug;
+
+       /* statistics */
+       unsigned long       st_print;
+       int                 st_rd_req;
+       int                 st_wr_req;
+       int                 st_oo_req;
 
        struct work_struct free_work;
 
@@ -101,11 +111,10 @@
 
 void blkif_interface_init(void);
 
-void blkif_deschedule(blkif_t *blkif);
-
 void blkif_xenbus_init(void);
 
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+int blkif_schedule(void *arg);
 
 void update_blkif_status(blkif_t *blkif); 
 
diff -r 0255f48b757f linux-2.6-xen-sparse/drivers/xen/blkback/interface.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c      Sun Dec  4 19:12:00 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c      Tue Dec  6 15:29:06 2005
@@ -24,6 +24,8 @@
        blkif->status = DISCONNECTED;
        spin_lock_init(&blkif->blk_ring_lock);
        atomic_set(&blkif->refcnt, 1);
+       init_waitqueue_head(&blkif->wq);
+       blkif->st_print = jiffies;
 
        return blkif;
 }
@@ -111,6 +113,7 @@
 
        blkif->irq = bind_evtchn_to_irqhandler(
                blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
+       wake_up(&blkif->wq);
 
        /* We're potentially connected now */
        update_blkif_status(blkif); 
diff -r 0255f48b757f linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Sun Dec  4 19:12:00 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Tue Dec  6 15:29:06 2005
@@ -20,6 +20,7 @@
 
 #include <stdarg.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 #include <asm-xen/xenbus.h>
 #include "common.h"
 
@@ -92,6 +93,8 @@
        }
        if (be->blkif) {
                be->blkif->status = DISCONNECTED; 
+               if (be->blkif->xenblkd)
+                       kthread_stop(be->blkif->xenblkd);
                blkif_put(be->blkif);
                be->blkif = NULL;
        }
@@ -217,6 +220,17 @@
                        be->major = 0;
                        be->minor = 0;
                        xenbus_dev_fatal(dev, err, "creating vbd structure");
+                       return;
+               }
+
+               be->blkif->xenblkd = kthread_run(blkif_schedule, be->blkif,
+                                                "xvd %d %02x:%02x",
+                                                be->blkif->domid,
+                                                be->major, be->minor);
+               if (IS_ERR(be->blkif->xenblkd)) {
+                       err = PTR_ERR(be->blkif->xenblkd);
+                       be->blkif->xenblkd = NULL;
+                       xenbus_dev_error(dev, err, "start xenblkd");
                        return;
                }
 
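
A small reusable pattern buried in the blkback.c changes, for anyone who wants to borrow it: the old pending_prod/pending_cons allocation ring is replaced by a free list under a spinlock, plus a wait queue so the per-backend thread can sleep until a request slot is returned (alloc_req(), free_req() and the pending_free_wq wait in blkif_schedule()). A minimal sketch of that idea with hypothetical names (struct slot, alloc_slot(), free_slot() and wait_for_slot() are not from the patch):

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/wait.h>
#include <linux/sched.h>

/* Hypothetical request slot, standing in for pending_req_t. */
struct slot {
	struct list_head free_list;
	/* ... per-request state ... */
};

static LIST_HEAD(free_slots);
static DEFINE_SPINLOCK(free_slots_lock);
static DECLARE_WAIT_QUEUE_HEAD(free_slots_wq);

/* Take a slot off the free list; NULL means the pool is exhausted. */
static struct slot *alloc_slot(void)
{
	struct slot *s = NULL;
	unsigned long flags;

	spin_lock_irqsave(&free_slots_lock, flags);
	if (!list_empty(&free_slots)) {
		s = list_entry(free_slots.next, struct slot, free_list);
		list_del(&s->free_list);
	}
	spin_unlock_irqrestore(&free_slots_lock, flags);
	return s;
}

/* Give a slot back; wake sleepers only on the empty -> non-empty edge. */
static void free_slot(struct slot *s)
{
	unsigned long flags;
	int was_empty;

	spin_lock_irqsave(&free_slots_lock, flags);
	was_empty = list_empty(&free_slots);
	list_add(&s->free_list, &free_slots);
	spin_unlock_irqrestore(&free_slots_lock, flags);

	if (was_empty)
		wake_up(&free_slots_wq);
}

/* Consumer side: block until a slot is returned instead of busy-polling. */
static struct slot *wait_for_slot(void)
{
	struct slot *s;

	while ((s = alloc_slot()) == NULL)
		wait_event_interruptible(free_slots_wq,
					 !list_empty(&free_slots));
	return s;
}

Waking only on the empty to non-empty transition keeps wake_up() off the hot path while slots are plentiful, which is exactly what free_req() does above.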
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel