[Xen-ia64-devel] Re: [Xen-devel] [patch] CFQ for xen domains

To: <xen-ia64-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-ia64-devel] Re: [Xen-devel] [patch] CFQ for xen domains
From: "Magenheimer, Dan (HP Labs Fort Collins)" <dan.magenheimer@xxxxxx>
Date: Thu, 10 Nov 2005 18:00:15 -0800
Cc: Gerd Knorr <kraxel@xxxxxxx>
Delivery-date: Fri, 11 Nov 2005 02:00:24 +0000
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-ia64-devel-request@lists.xensource.com?subject=help>
List-id: Discussion of the ia64 port of Xen <xen-ia64-devel.lists.xensource.com>
List-post: <mailto:xen-ia64-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-ia64-devel>, <mailto:xen-ia64-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-ia64-devel>, <mailto:xen-ia64-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-ia64-devel-bounces@xxxxxxxxxxxxxxxxxxx
Thread-index: AcXmY6jkzrge736TSW6X4ooKlrne+A==
Thread-topic: Re: [Xen-devel] [patch] CFQ for xen domains
Thought I should point out this proposed blkback patch, as it is fairly
extensive and may cause problems for multiple domains on Xen/ia64.
Perhaps Kevin or Matt can look it over before it gets applied?  At a
quick glance, the removal of xen_init() looks like a mistake.

http://lists.xensource.com/archives/html/xen-devel/2005-11/msg00371.html

====

> Does 'xm save / xm restore' work with this patch (at least as well as it
> currently does ;-)?

xm save/restore still doesn't work for me with either the sparse tree
or the linux-2.6 repository, so I can't try.  I can't see any reason
why it should get worse with this patch, though.

I've resynced the blkback threading patch with the latest sparse tree;
here it is.  Changes:

   * One thread per blkif.  The I/O scheduler can do a better job that
     way, and you can use ionice on the blkback threads to adjust the
     block I/O priority for each domain (see the example after this
     list).
   * Various global variables have been moved into blkif_t.
   * The scary allocation ring for pending_req's is gone; it has been
     replaced by a free list.
   * Made dispatch_rw_block_io() reentrant.
   * General Linux coding style cleanup, at least for the code I've
     touched.
   * The number of outstanding requests is now runtime-configurable
     (blkif_reqs= boot parameter).
   * Made the ia64 #ifdefs smaller and dropped one.  It should still
     work on ia64 in theory, but it would be great if the ia64 folks
     could have a look ...
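
     (Example for the ionice bullet above -- a sketch only: it assumes
     the "xenblkd <domid>/<dev>" thread naming from the xenbus.c hunk
     below, blkback built into the dom0 kernel so the blkif_reqs= boot
     parameter is honoured, and CFQ as the dom0 I/O scheduler so the
     idle class has an effect.  The domain id and PID are made up.)

        # dom0 boot parameter: allow more outstanding requests per blkif
        #   blkif_reqs=128

        # find the kernel thread serving (say) domain 3 ...
        ps -eo pid,comm | grep 'xenblkd 3/'   # -> "1234 xenblkd 3/0301"
        # ... and drop it to the idle I/O class
        ionice -c3 -p 1234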

cheers, 

   Gerd 

diff -r abbe3df33774 linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c 
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Tue Nov  8 17:39:58 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Wed Nov  9 13:45:37 2005
@@ -12,6 +12,8 @@ 
  */ 
  
 #include <linux/spinlock.h> 
+#include <linux/kthread.h> 
+#include <linux/list.h> 
 #include <asm-xen/balloon.h> 
 #include <asm/hypervisor.h> 
 #include "common.h" 
@@ -21,26 +23,21 @@ 
  * pulled from a communication ring are quite likely to end up being part of
  * the same scatter/gather request at the disc. 
  * 
- * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** 
+ * 
  * This will increase the chances of being able to write whole tracks. 
  * 64 should be enough to keep us competitive with Linux. 
  */ 
-#define MAX_PENDING_REQS 64 
-#define BATCH_PER_DOMAIN 16 
- 
-static unsigned long mmap_vstart; 
-#define MMAP_PAGES \ 
- (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) 
-#ifdef __ia64__ 
-static void *pending_vaddrs[MMAP_PAGES]; 
-#define MMAP_VADDR(_idx, _i) \ 
- (unsigned long)(pending_vaddrs[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#else 
-#define MMAP_VADDR(_req,_seg) \ 
- (mmap_vstart + \ 
- ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ 
- ((_seg) * PAGE_SIZE)) 
-#endif 
+static int blkif_reqs = 64; 
+static int mmap_pages; 
+ 
+static int __init set_blkif_reqs(char *str) 
+{ 
+ get_option(&str, &blkif_reqs); 
+ return 1; 
+} 
+__setup("blkif_reqs=", set_blkif_reqs); 
+ 
  
 /* 
  * Each outstanding request that we've passed to the lower device layers has a
@@ -55,43 +52,38 @@ 
         atomic_t       pendcnt; 
         unsigned short operation; 
         int            status; 
+ struct list_head free_list; 
 } pending_req_t; 
  
-/* 
- * We can't allocate pending_req's in order, since they may complete
out of 
- * order. We therefore maintain an allocation ring. This ring also
indicates 
- * when enough work has been passed down -- at that point the
allocation ring 
- * will be empty. 
- */ 
-static pending_req_t pending_reqs[MAX_PENDING_REQS]; 
-static unsigned char pending_ring[MAX_PENDING_REQS]; 
-static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED; 
-/* NB. We use a different index type to differentiate from shared blk rings. */
-typedef unsigned int PEND_RING_IDX; 
-#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) 
-static PEND_RING_IDX pending_prod, pending_cons; 
-#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
- 
-static request_queue_t *plugged_queue; 
-static inline void flush_plugged_queue(void) 
-{ 
- request_queue_t *q = plugged_queue; 
- if (q != NULL) { 
- if ( q->unplug_fn != NULL ) 
- q->unplug_fn(q); 
- blk_put_queue(q); 
- plugged_queue = NULL; 
- } 
-} 
+static pending_req_t *pending_reqs; 
+static struct list_head pending_free; 
+static spinlock_t pending_free_lock = SPIN_LOCK_UNLOCKED; 
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq); 
+ 
+#define BLKBACK_INVALID_HANDLE (0xFFFF) 
+ 
+static unsigned long mmap_vstart; 
+static unsigned long *pending_vaddrs; 
+static u16 *pending_grant_handles; 
+ 
+static inline int vaddr_pagenr(pending_req_t *req, int seg) 
+{ 
+ return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; 
+} 
+ 
+static inline unsigned long vaddr(pending_req_t *req, int seg) 
+{ 
+ return pending_vaddrs[vaddr_pagenr(req, seg)]; 
+} 
+ 
+#define pending_handle(_req, _seg) \ 
+ (pending_grant_handles[vaddr_pagenr(_req, _seg)]) 
+ 
  
 /* When using grant tables to map a frame for device access then the 
  * handle returned must be used to unmap the frame. This is needed to 
  * drop the ref count on the frame. 
  */ 
-static u16 pending_grant_handles[MMAP_PAGES]; 
-#define pending_handle(_idx, _i) \ 
-    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#define BLKBACK_INVALID_HANDLE (0xFFFF) 
  
 #ifdef CONFIG_XEN_BLKDEV_TAP_BE 
 /* 
@@ -105,26 +97,79 @@ 
 static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
 #endif 
  
-static int do_block_io_op(blkif_t *blkif, int max_to_do); 
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static int do_block_io_op(blkif_t *blkif); 
+static void dispatch_rw_block_io(blkif_t *blkif, 
+ blkif_request_t *req, 
+ pending_req_t *pending_req); 
 static void make_response(blkif_t *blkif, unsigned long id, 
                           unsigned short op, int st); 
  
-static void fast_flush_area(int idx, int nr_pages) 
+/****************************************************************** 
+ * misc small helpers 
+ */ 
+static pending_req_t* alloc_req(void) 
+{ 
+ pending_req_t *req = NULL; 
+ unsigned long flags; 
+ 
+ spin_lock_irqsave(&pending_free_lock, flags); 
+ if (!list_empty(&pending_free)) { 
+ req = list_entry(pending_free.next, pending_req_t, free_list); 
+ list_del(&req->free_list); 
+ } 
+ spin_unlock_irqrestore(&pending_free_lock, flags); 
+ return req; 
+} 
+ 
+static void free_req(pending_req_t *req) 
+{ 
+ unsigned long flags; 
+ int was_empty; 
+ 
+ spin_lock_irqsave(&pending_free_lock, flags); 
+ was_empty = list_empty(&pending_free); 
+ list_add(&req->free_list, &pending_free); 
+ spin_unlock_irqrestore(&pending_free_lock, flags); 
+ if (was_empty) 
+ wake_up(&pending_free_wq); 
+} 
+ 
+static void unplug_queue(blkif_t *blkif) 
+{ 
+ if (NULL == blkif->plug) 
+ return; 
+ if (blkif->plug->unplug_fn) 
+ blkif->plug->unplug_fn(blkif->plug); 
+ blk_put_queue(blkif->plug); 
+ blkif->plug = NULL; 
+} 
+ 
+static void plug_queue(blkif_t *blkif, struct bio *bio) 
+{ 
+ request_queue_t *q = bdev_get_queue(bio->bi_bdev); 
+ 
+ if (q == blkif->plug) 
+ return; 
+ unplug_queue(blkif); 
+ blk_get_queue(q); 
+ blkif->plug = q; 
+} 
+ 
+static void fast_flush_area(pending_req_t *req) 
 { 
         struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
         unsigned int i, invcount = 0; 
         u16 handle; 
         int ret; 
  
- for (i = 0; i < nr_pages; i++) { 
- handle = pending_handle(idx, i); 
+ for (i = 0; i < req->nr_pages; i++) { 
+ handle = pending_handle(req, i); 
                 if (handle == BLKBACK_INVALID_HANDLE) 
                         continue; 
- unmap[invcount].host_addr    = MMAP_VADDR(idx, i); 
+ unmap[invcount].host_addr    = vaddr(req, i); 
                 unmap[invcount].dev_bus_addr = 0; 
                 unmap[invcount].handle       = handle; 
- pending_handle(idx, i) = BLKBACK_INVALID_HANDLE; 
+ pending_handle(req, i) = BLKBACK_INVALID_HANDLE; 
                 invcount++; 
         } 
  
@@ -133,109 +178,56 @@ 
         BUG_ON(ret); 
 } 
  
- 
-/****************************************************************** 
- * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE 
- */ 
- 
-static struct list_head blkio_schedule_list; 
-static spinlock_t blkio_schedule_list_lock; 
- 
-static int __on_blkdev_list(blkif_t *blkif) 
-{ 
- return blkif->blkdev_list.next != NULL; 
-} 
- 
-static void remove_from_blkdev_list(blkif_t *blkif) 
-{ 
- unsigned long flags; 
- 
- if (!__on_blkdev_list(blkif)) 
- return; 
- 
- spin_lock_irqsave(&blkio_schedule_list_lock, flags); 
- if (__on_blkdev_list(blkif)) { 
- list_del(&blkif->blkdev_list); 
- blkif->blkdev_list.next = NULL; 
- blkif_put(blkif); 
- } 
- spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); 
-} 
- 
-static void add_to_blkdev_list_tail(blkif_t *blkif) 
-{ 
- unsigned long flags; 
- 
- if (__on_blkdev_list(blkif)) 
- return; 
- 
- spin_lock_irqsave(&blkio_schedule_list_lock, flags); 
- if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) { 
- list_add_tail(&blkif->blkdev_list, &blkio_schedule_list); 
- blkif_get(blkif); 
- } 
- spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); 
-} 
- 
- 
 /****************************************************************** 
  * SCHEDULER FUNCTIONS 
  */ 
  
-static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait); 
- 
-static int blkio_schedule(void *arg) 
-{ 
- DECLARE_WAITQUEUE(wq, current); 
- 
- blkif_t          *blkif; 
- struct list_head *ent; 
- 
- daemonize("xenblkd"); 
- 
+int blkif_schedule(void *arg) 
+{ 
+ blkif_t          *blkif = arg; 
+ 
+ blkif_get(blkif); 
+ printk(KERN_DEBUG "%s: started\n", current->comm); 
         for (;;) { 
- /* Wait for work to do. */ 
- add_wait_queue(&blkio_schedule_wait, &wq); 
- set_current_state(TASK_INTERRUPTIBLE); 
- if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || 
-     list_empty(&blkio_schedule_list) ) 
- schedule(); 
- __set_current_state(TASK_RUNNING); 
- remove_wait_queue(&blkio_schedule_wait, &wq); 
- 
- /* Queue up a batch of requests. */ 
- while ((NR_PENDING_REQS < MAX_PENDING_REQS) && 
-       !list_empty(&blkio_schedule_list)) { 
- ent = blkio_schedule_list.next; 
- blkif = list_entry(ent, blkif_t, blkdev_list); 
- blkif_get(blkif); 
- remove_from_blkdev_list(blkif); 
- if (do_block_io_op(blkif, BATCH_PER_DOMAIN)) 
- add_to_blkdev_list_tail(blkif); 
- blkif_put(blkif); 
- } 
- 
- /* Push the batch through to disc. */ 
- flush_plugged_queue(); 
- } 
-} 
- 
-static void maybe_trigger_blkio_schedule(void) 
-{ 
- /* 
- * Needed so that two processes, which together make the following 
- * predicate true, don't both read stale values and evaluate the 
- * predicate incorrectly. Incredibly unlikely to stall the scheduler 
- * on x86, but... 
- */ 
- smp_mb(); 
- 
- if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && 
-    !list_empty(&blkio_schedule_list)) 
- wake_up(&blkio_schedule_wait); 
-} 
- 
- 
+ if (!atomic_read(&blkif->io_pending)) { 
+ /* Wait for work to do or requests to exit. */ 
+ if (kthread_should_stop()) 
+ break; 
+ wait_event_interruptible(blkif->wq, 
+ atomic_read(&blkif->io_pending) || 
+ kthread_should_stop()); 
+ } else if (list_empty(&pending_free)) { 
+ /* Wait for pending_req becoming available. */ 
+ wait_event_interruptible(pending_free_wq, 
+ !list_empty(&pending_free)); 
+ } 
+ 
+ /* Schedule I/O */ 
+ atomic_set(&blkif->io_pending, 0); 
+ if (do_block_io_op(blkif)) 
+ atomic_inc(&blkif->io_pending); 
+ unplug_queue(blkif); 
+ 
+#if 0 
+ /* Print stats for performance debugging. */ 
+ if (time_after(jiffies, blkif->st_print)) { 
+ printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n", 
+       current->comm, blkif->st_oo_req, 
+       blkif->st_rd_req, blkif->st_wr_req); 
+ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); 
+ blkif->st_rd_req = 0; 
+ blkif->st_wr_req = 0; 
+ blkif->st_oo_req = 0; 
+ } 
+#endif 
+ } 
+ 
+ /* bye folks, and thanks for all the fish ;) */ 
+ printk(KERN_DEBUG "%s: exiting\n", current->comm); 
+ blkif->xenblkd = NULL; 
+ blkif_put(blkif); 
+ return 0; 
+} 
  
 /****************************************************************** 
  * COMPLETION CALLBACK -- Called as bh->b_end_io() 
@@ -243,8 +235,6 @@ 
  
 static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
 { 
- unsigned long flags; 
- 
         /* An error fails the entire request. */ 
         if (!uptodate) { 
                 DPRINTK("Buffer not up-to-date at end of operation\n");

@@ -252,15 +242,11 @@ 
         } 
  
         if (atomic_dec_and_test(&pending_req->pendcnt)) { 
- int pending_idx = pending_req - pending_reqs; 
- fast_flush_area(pending_idx, pending_req->nr_pages); 
+ fast_flush_area(pending_req); 
                 make_response(pending_req->blkif, pending_req->id, 
                               pending_req->operation, pending_req->status);
                 blkif_put(pending_req->blkif); 
- spin_lock_irqsave(&pend_prod_lock, flags); 
- pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; 
- spin_unlock_irqrestore(&pend_prod_lock, flags); 
- maybe_trigger_blkio_schedule(); 
+ free_req(pending_req); 
         } 
 } 
  
@@ -281,8 +267,10 @@ 
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) 
 { 
         blkif_t *blkif = dev_id; 
- add_to_blkdev_list_tail(blkif); 
- maybe_trigger_blkio_schedule(); 
+ 
+ atomic_inc(&blkif->io_pending); 
+ if (blkif->status == CONNECTED) 
+ wake_up(&blkif->wq); 
         return IRQ_HANDLED; 
 } 
  
@@ -292,10 +280,11 @@ 
  * DOWNWARD CALLS -- These interface with the block-device layer proper.
  */ 
  
-static int do_block_io_op(blkif_t *blkif, int max_to_do) 
+static int do_block_io_op(blkif_t *blkif) 
 { 
         blkif_back_ring_t *blk_ring = &blkif->blk_ring; 
         blkif_request_t *req; 
+ pending_req_t *pending_req; 
         RING_IDX i, rp; 
         int more_to_do = 0; 
  
@@ -305,24 +294,30 @@ 
         for (i = blk_ring->req_cons; 
              (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i); 
              i++) { 
- if ((max_to_do-- == 0) || 
-    (NR_PENDING_REQS == MAX_PENDING_REQS)) { 
+ 
+ pending_req = alloc_req(); 
+ if (NULL == pending_req) { 
+ blkif->st_oo_req++; 
                         more_to_do = 1; 
                         break; 
                 } 
-         
+ 
                 req = RING_GET_REQUEST(blk_ring, i); 
                 switch (req->operation) { 
                 case BLKIF_OP_READ: 
+ blkif->st_rd_req++; 
+ dispatch_rw_block_io(blkif, req, pending_req); 
+ break; 
                 case BLKIF_OP_WRITE: 
- dispatch_rw_block_io(blkif, req); 
+ blkif->st_wr_req++; 
+ dispatch_rw_block_io(blkif, req, pending_req); 
                         break; 
- 
                 default: 
                         DPRINTK("error: unknown block io operation
[%d]\n", 
                                 req->operation); 
                         make_response(blkif, req->id, req->operation, 
                                       BLKIF_RSP_ERROR); 
+ free_req(pending_req); 
                         break; 
                 } 
         } 
@@ -331,13 +326,13 @@ 
         return more_to_do; 
 } 
  
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) 
+static void dispatch_rw_block_io(blkif_t *blkif, 
+ blkif_request_t *req, 
+ pending_req_t *pending_req) 
 { 
         extern void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]);
         int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
         unsigned long fas = 0; 
- int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; 
- pending_req_t *pending_req; 
         struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
         struct phys_req preq; 
         struct { 
@@ -345,31 +340,35 @@ 
         } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 
         unsigned int nseg; 
         struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- int nbio = 0; 
- request_queue_t *q; 
- int ret, errors = 0; 
+ int ret, i, nbio = 0; 
  
         /* Check that number of segments is sane. */ 
         nseg = req->nr_segments; 
         if (unlikely(nseg == 0) || 
             unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { 
                 DPRINTK("Bad number of segments in request (%d)\n",
nseg); 
- goto bad_descriptor; 
+ goto fail_response; 
         } 
  
         preq.dev           = req->handle; 
         preq.sector_number = req->sector_number; 
         preq.nr_sects      = 0; 
  
+ pending_req->blkif     = blkif; 
+ pending_req->id        = req->id; 
+ pending_req->operation = operation; 
+ pending_req->status    = BLKIF_RSP_OKAY; 
+ pending_req->nr_pages  = nseg; 
+ 
         for (i = 0; i < nseg; i++) { 
                 fas         = req->frame_and_sects[i]; 
                 seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
  
                 if (seg[i].nsec <= 0) 
- goto bad_descriptor; 
+ goto fail_response; 
                 preq.nr_sects += seg[i].nsec; 
  
- map[i].host_addr = MMAP_VADDR(pending_idx, i); 
+ map[i].host_addr = vaddr(pending_req, i); 
                 map[i].dom = blkif->domid; 
                 map[i].ref = blkif_gref_from_fas(fas); 
                 map[i].flags = GNTMAP_host_map; 
@@ -381,27 +380,23 @@ 
         BUG_ON(ret); 
  
         for (i = 0; i < nseg; i++) { 
- if (likely(map[i].handle >= 0)) { 
- pending_handle(pending_idx, i) = map[i].handle; 
+ if (unlikely(map[i].handle < 0)) { 
+ DPRINTK("invalid buffer -- could not remap it\n"); 
+ goto fail_flush; 
+ } 
+ 
+ pending_handle(pending_req, i) = map[i].handle; 
 #ifdef __ia64__ 
- MMAP_VADDR(pending_idx,i) = gnttab_map_vaddr(map[i]); 
+ pending_vaddrs[vaddr_pagenr(pending_req, i)] = 
+ gnttab_map_vaddr(map[i]); 
 #else 
- phys_to_machine_mapping[__pa(MMAP_VADDR( 
- pending_idx, i)) >> PAGE_SHIFT] = 
- FOREIGN_FRAME(map[i].dev_bus_addr>>PAGE_SHIFT); 
+ phys_to_machine_mapping[__pa(vaddr( 
+ pending_req, i)) >> PAGE_SHIFT] = 
+ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT); 
 #endif 
- fas        = req->frame_and_sects[i]; 
- seg[i].buf = map[i].dev_bus_addr | 
- (blkif_first_sect(fas) << 9); 
- } else { 
- errors++; 
- } 
- } 
- 
- if (errors) { 
- DPRINTK("invalid buffer -- could not remap it\n"); 
- fast_flush_area(pending_idx, nseg); 
- goto bad_descriptor; 
+ fas         = req->frame_and_sects[i]; 
+ seg[i].buf  = map[i].dev_bus_addr | 
+ (blkif_first_sect(fas) << 9); 
         } 
  
         if (vbd_translate(&preq, blkif, operation) != 0) { 
@@ -409,37 +404,25 @@ 
                         operation == READ ? "read" : "write", 
                         preq.sector_number, 
                         preq.sector_number + preq.nr_sects, preq.dev); 
- goto bad_descriptor; 
- } 
- 
- pending_req = &pending_reqs[pending_idx]; 
- pending_req->blkif     = blkif; 
- pending_req->id        = req->id; 
- pending_req->operation = operation; 
- pending_req->status    = BLKIF_RSP_OKAY; 
- pending_req->nr_pages  = nseg; 
+ goto fail_flush; 
+ } 
  
         for (i = 0; i < nseg; i++) { 
                 if (((int)preq.sector_number|(int)seg[i].nsec) & 
                     ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) { 
                         DPRINTK("Misaligned I/O request from domain
%d", 
                                 blkif->domid); 
- goto cleanup_and_fail; 
+ goto fail_put_bio; 
                 } 
  
                 while ((bio == NULL) || 
                        (bio_add_page(bio, 
-     virt_to_page(MMAP_VADDR(pending_idx, i)), 
+     virt_to_page(vaddr(pending_req, i)), 
                                      seg[i].nsec << 9, 
                                      seg[i].buf & ~PAGE_MASK) == 0)) { 
                         bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
- if (unlikely(bio == NULL)) { 
- cleanup_and_fail: 
- for (i = 0; i < (nbio-1); i++) 
- bio_put(biolist[i]); 
- fast_flush_area(pending_idx, nseg); 
- goto bad_descriptor; 
- } 
+ if (unlikely(bio == NULL)) 
+ goto fail_put_bio; 
                  
                         bio->bi_bdev    = preq.bdev; 
                         bio->bi_private = pending_req; 
@@ -450,14 +433,8 @@ 
                 preq.sector_number += seg[i].nsec; 
         } 
  
- if ((q = bdev_get_queue(bio->bi_bdev)) != plugged_queue) { 
- flush_plugged_queue(); 
- blk_get_queue(q); 
- plugged_queue = q; 
- } 
- 
+ plug_queue(blkif, bio); 
         atomic_set(&pending_req->pendcnt, nbio); 
- pending_cons++; 
         blkif_get(blkif); 
  
         for (i = 0; i < nbio; i++) 
@@ -465,8 +442,14 @@ 
  
         return; 
  
- bad_descriptor: 
+ fail_put_bio: 
+ for (i = 0; i < (nbio-1); i++) 
+ bio_put(biolist[i]); 
+ fail_flush: 
+ fast_flush_area(pending_req); 
+ fail_response: 
         make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+ free_req(pending_req); 
 } 
  
  
@@ -498,56 +481,47 @@ 
         notify_remote_via_irq(blkif->irq); 
 } 
  
-void blkif_deschedule(blkif_t *blkif) 
-{ 
- remove_from_blkdev_list(blkif); 
-} 
- 
 static int __init blkif_init(void) 
 { 
+ struct page *page; 
         int i; 
- struct page *page; 
- int ret; 
- 
- for (i = 0; i < MMAP_PAGES; i++) 
- pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; 
- 
- if (xen_init() < 0) 
- return -ENODEV; 
+ 
+ mmap_pages            = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; 
+ pending_reqs          = kmalloc(sizeof(pending_reqs[0]) * 
+ blkif_reqs, GFP_KERNEL); 
+ pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) * 
+ mmap_pages, GFP_KERNEL); 
+ pending_vaddrs        = kmalloc(sizeof(pending_vaddrs[0]) * 
+ mmap_pages, GFP_KERNEL); 
+ if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) { 
+ printk("%s: out of memory\n", __FUNCTION__); 
+ return -1; 
+ } 
  
         blkif_interface_init(); 
- 
+ 
 #ifdef __ia64__ 
-    { 
         extern unsigned long alloc_empty_foreign_map_page_range(unsigned long pages);
- int i; 
- 
- mmap_vstart =  alloc_empty_foreign_map_page_range(MMAP_PAGES); 
- printk("Allocated mmap_vstart: 0x%lx\n", mmap_vstart); 
- for(i = 0; i < MMAP_PAGES; i++) 
-    pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT); 
- BUG_ON(mmap_vstart == NULL); 
-    } 
-#else 
- page = balloon_alloc_empty_page_range(MMAP_PAGES); 
+ mmap_vstart = (unsigned long)alloc_empty_foreign_map_page_range(mmap_pages);
+#else /* ! ia64 */ 
+ page = balloon_alloc_empty_page_range(mmap_pages); 
         BUG_ON(page == NULL); 
         mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); 
 #endif 
- 
- pending_cons = 0; 
- pending_prod = MAX_PENDING_REQS; 
+ printk("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n", 
+       __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart); 
+ BUG_ON(mmap_vstart == 0); 
+ for (i = 0; i < mmap_pages; i++) 
+ pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT); 
+ 
+ memset(pending_grant_handles,  BLKBACK_INVALID_HANDLE, mmap_pages); 
         memset(pending_reqs, 0, sizeof(pending_reqs)); 
- for (i = 0; i < MAX_PENDING_REQS; i++) 
- pending_ring[i] = i; 
+ INIT_LIST_HEAD(&pending_free); 
+ 
+ for (i = 0; i < blkif_reqs; i++) 
+ list_add_tail(&pending_reqs[i].free_list, &pending_free); 
      
- spin_lock_init(&blkio_schedule_list_lock); 
- INIT_LIST_HEAD(&blkio_schedule_list); 
- 
- ret = kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES); 
- BUG_ON(ret < 0); 
- 
         blkif_xenbus_init(); 
- 
         return 0; 
 } 
  
diff -r abbe3df33774 linux-2.6-xen-sparse/drivers/xen/blkback/common.h 
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Tue Nov  8 17:39:58 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Wed Nov  9 13:45:37 2005
@@ -56,9 +56,19 @@ 
         /* Is this a blktap frontend */ 
         unsigned int     is_blktap; 
 #endif 
- struct list_head blkdev_list; 
         spinlock_t       blk_ring_lock; 
         atomic_t         refcnt; 
+ 
+ wait_queue_head_t   wq; 
+ struct task_struct  *xenblkd; 
+ atomic_t            io_pending; 
+ request_queue_t     *plug; 
+ 
+ /* statistics */ 
+ unsigned long       st_print; 
+ int                 st_rd_req; 
+ int                 st_wr_req; 
+ int                 st_oo_req; 
  
         struct work_struct free_work; 
  
@@ -97,11 +107,10 @@ 
  
 void blkif_interface_init(void); 
  
-void blkif_deschedule(blkif_t *blkif); 
- 
 void blkif_xenbus_init(void); 
  
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); 
+int blkif_schedule(void *arg); 
  
 #endif /* __BLKIF__BACKEND__COMMON_H__ */ 
  
diff -r abbe3df33774 linux-2.6-xen-sparse/drivers/xen/blkback/interface.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c Tue Nov  8 17:39:58 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c Wed Nov  9 13:45:37 2005
@@ -24,6 +24,8 @@ 
         blkif->status = DISCONNECTED; 
         spin_lock_init(&blkif->blk_ring_lock); 
         atomic_set(&blkif->refcnt, 1); 
+ init_waitqueue_head(&blkif->wq); 
+ blkif->st_print = jiffies; 
  
         return blkif; 
 } 
diff -r abbe3df33774 linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c 
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Tue Nov  8 17:39:58 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Wed Nov  9 13:45:37 2005
@@ -17,6 +17,7 @@ 
 */ 
 #include <stdarg.h> 
 #include <linux/module.h> 
+#include <linux/kthread.h> 
 #include <asm-xen/xenbus.h> 
 #include "common.h" 
  
@@ -46,8 +47,11 @@ 
         if (be->watch.node) 
                 unregister_xenbus_watch(&be->watch); 
         unregister_xenbus_watch(&be->backend_watch); 
- if (be->blkif) 
+ if (be->blkif) { 
+ if (be->blkif->xenblkd) 
+ kthread_stop(be->blkif->xenblkd); 
                 blkif_put(be->blkif); 
+ } 
         if (be->frontpath) 
                 kfree(be->frontpath); 
         kfree(be); 
@@ -198,6 +202,16 @@ 
                         be->blkif = NULL; 
                         xenbus_dev_error(dev, err, 
                                          "creating vbd structure"); 
+ return; 
+ } 
+ 
+ be->blkif->xenblkd = kthread_run(blkif_schedule, be->blkif, 
+ "xenblkd %d/%04lx", 
+ be->blkif->domid, be->pdev); 
+ if (IS_ERR(be->blkif->xenblkd)) { 
+ err = PTR_ERR(be->blkif->xenblkd); 
+ be->blkif->xenblkd = NULL; 
+ xenbus_dev_error(dev, err, "start xenblkd"); 
                         return; 
                 } 
  
_______________________________________________ 
Xen-devel mailing list 
Xen-devel@... 
http://lists.xensource.com/xen-devel 

_______________________________________________
Xen-ia64-devel mailing list
Xen-ia64-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-ia64-devel
