[Xen-ia64-devel] Re: [Xen-devel] [patch] CFQ for xen domains

To: <xen-ia64-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-ia64-devel] Re: [Xen-devel] [patch] CFQ for xen domains
From: "Magenheimer, Dan (HP Labs Fort Collins)" <dan.magenheimer@xxxxxx>
Date: Thu, 10 Nov 2005 18:00:15 -0800
Cc: Gerd Knorr <kraxel@xxxxxxx>
Delivery-date: Fri, 11 Nov 2005 02:00:24 +0000
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-ia64-devel-request@lists.xensource.com?subject=help>
List-id: Discussion of the ia64 port of Xen <xen-ia64-devel.lists.xensource.com>
List-post: <mailto:xen-ia64-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-ia64-devel>, <mailto:xen-ia64-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-ia64-devel>, <mailto:xen-ia64-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-ia64-devel-bounces@xxxxxxxxxxxxxxxxxxx
Thread-index: AcXmY6jkzrge736TSW6X4ooKlrne+A==
Thread-topic: Re: [Xen-devel] [patch] CFQ for xen domains
Thought I should point out this proposed blkback patch, as it is fairly
extensive and may cause problems for multiple domains on Xen/ia64.
Perhaps Kevin or Matt can look it over before it gets applied?  At a
quick glance, the removal of xen_init() looks like a mistake.

http://lists.xensource.com/archives/html/xen-devel/2005-11/msg00371.html

====

> Does 'xm save / xm restore' work with this patch (at least as well as it
> currently does ;-)?

xm save/restore still doesn't work for me with either the sparse tree
or the linux-2.6 repository, so I can't try.  I can't see any reason
why it should get worse with this patch, though.

I've resynced the blkback threading patch with the latest sparse tree;
here it is.  Changes:

   * One thread per blkif.  The I/O scheduler can do a better job that
     way, and you can use ionice on the blkback threads to adjust the
     block I/O priority for each domain (see the example after this
     list).
   * Various global variables have been moved into blkif_t.
   * The scary allocation ring for pending_req's is gone; it has been
     replaced by a free list.
   * Made dispatch_rw_block_io() reentrant.
   * General Linux coding style cleanup, at least for the code I've
     touched.
   * The number of outstanding requests is now runtime-configurable
     (blkif_reqs= boot parameter).
   * Made the ia64 #ifdefs smaller and dropped one.  It should still
     work on ia64 in theory, but it would be great if the ia64 folks
     could have a look ...
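
     (Example for the ionice bullet above -- a sketch only: it assumes
     the "xenblkd <domid>/<dev>" thread naming from the xenbus.c hunk
     below, blkback built into the dom0 kernel so the blkif_reqs= boot
     parameter is honoured, and CFQ as the dom0 I/O scheduler so the
     idle class has an effect.  The domain id and PID are made up.)

        # dom0 boot parameter: allow more outstanding requests per blkif
        #   blkif_reqs=128

        # find the kernel thread serving (say) domain 3 ...
        ps -eo pid,comm | grep 'xenblkd 3/'   # -> "1234 xenblkd 3/0301"
        # ... and drop it to the idle I/O class
        ionice -c3 -p 1234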

cheers, 

   Gerd 

diff -r abbe3df33774 linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c 
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Tue Nov  8 17:39:58 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Wed Nov  9 13:45:37 2005
@@ -12,6 +12,8 @@ 
  */ 
  
 #include <linux/spinlock.h> 
+#include <linux/kthread.h> 
+#include <linux/list.h> 
 #include <asm-xen/balloon.h> 
 #include <asm/hypervisor.h> 
 #include "common.h" 
@@ -21,26 +23,21 @@ 
  * pulled from a communication ring are quite likely to end up being part of
  * the same scatter/gather request at the disc. 
  * 
- * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** 
+ * 
  * This will increase the chances of being able to write whole tracks. 
  * 64 should be enough to keep us competitive with Linux. 
  */ 
-#define MAX_PENDING_REQS 64 
-#define BATCH_PER_DOMAIN 16 
- 
-static unsigned long mmap_vstart; 
-#define MMAP_PAGES \ 
- (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) 
-#ifdef __ia64__ 
-static void *pending_vaddrs[MMAP_PAGES]; 
-#define MMAP_VADDR(_idx, _i) \ 
- (unsigned long)(pending_vaddrs[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#else 
-#define MMAP_VADDR(_req,_seg) \ 
- (mmap_vstart + \ 
- ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ 
- ((_seg) * PAGE_SIZE)) 
-#endif 
+static int blkif_reqs = 64; 
+static int mmap_pages; 
+ 
+static int __init set_blkif_reqs(char *str) 
+{ 
+ get_option(&str, &blkif_reqs); 
+ return 1; 
+} 
+__setup("blkif_reqs=", set_blkif_reqs); 
+ 
  
 /* 
  * Each outstanding request that we've passed to the lower device layers has a
@@ -55,43 +52,38 @@ 
         atomic_t       pendcnt; 
         unsigned short operation; 
         int            status; 
+ struct list_head free_list; 
 } pending_req_t; 
  
-/* 
- * We can't allocate pending_req's in order, since they may complete
out of 
- * order. We therefore maintain an allocation ring. This ring also
indicates 
- * when enough work has been passed down -- at that point the
allocation ring 
- * will be empty. 
- */ 
-static pending_req_t pending_reqs[MAX_PENDING_REQS]; 
-static unsigned char pending_ring[MAX_PENDING_REQS]; 
-static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED; 
-/* NB. We use a different index type to differentiate from shared blk rings. */
-typedef unsigned int PEND_RING_IDX; 
-#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) 
-static PEND_RING_IDX pending_prod, pending_cons; 
-#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
- 
-static request_queue_t *plugged_queue; 
-static inline void flush_plugged_queue(void) 
-{ 
- request_queue_t *q = plugged_queue; 
- if (q != NULL) { 
- if ( q->unplug_fn != NULL ) 
- q->unplug_fn(q); 
- blk_put_queue(q); 
- plugged_queue = NULL; 
- } 
-} 
+static pending_req_t *pending_reqs; 
+static struct list_head pending_free; 
+static spinlock_t pending_free_lock = SPIN_LOCK_UNLOCKED; 
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq); 
+ 
+#define BLKBACK_INVALID_HANDLE (0xFFFF) 
+ 
+static unsigned long mmap_vstart; 
+static unsigned long *pending_vaddrs; 
+static u16 *pending_grant_handles; 
+ 
+static inline int vaddr_pagenr(pending_req_t *req, int seg) 
+{ 
+ return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; 
+} 
+ 
+static inline unsigned long vaddr(pending_req_t *req, int seg) 
+{ 
+ return pending_vaddrs[vaddr_pagenr(req, seg)]; 
+} 
+ 
+#define pending_handle(_req, _seg) \ 
+ (pending_grant_handles[vaddr_pagenr(_req, _seg)]) 
+ 
  
 /* When using grant tables to map a frame for device access then the 
  * handle returned must be used to unmap the frame. This is needed to 
  * drop the ref count on the frame. 
  */ 
-static u16 pending_grant_handles[MMAP_PAGES]; 
-#define pending_handle(_idx, _i) \ 
-    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#define BLKBACK_INVALID_HANDLE (0xFFFF) 
  
 #ifdef CONFIG_XEN_BLKDEV_TAP_BE 
 /* 
@@ -105,26 +97,79 @@ 
 static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
 #endif 
  
-static int do_block_io_op(blkif_t *blkif, int max_to_do); 
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static int do_block_io_op(blkif_t *blkif); 
+static void dispatch_rw_block_io(blkif_t *blkif, 
+ blkif_request_t *req, 
+ pending_req_t *pending_req); 
 static void make_response(blkif_t *blkif, unsigned long id, 
                           unsigned short op, int st); 
  
-static void fast_flush_area(int idx, int nr_pages) 
+/****************************************************************** 
+ * misc small helpers 
+ */ 
+static pending_req_t* alloc_req(void) 
+{ 
+ pending_req_t *req = NULL; 
+ unsigned long flags; 
+ 
+ spin_lock_irqsave(&pending_free_lock, flags); 
+ if (!list_empty(&pending_free)) { 
+ req = list_entry(pending_free.next, pending_req_t, free_list); 
+ list_del(&req->free_list); 
+ } 
+ spin_unlock_irqrestore(&pending_free_lock, flags); 
+ return req; 
+} 
+ 
+static void free_req(pending_req_t *req) 
+{ 
+ unsigned long flags; 
+ int was_empty; 
+ 
+ spin_lock_irqsave(&pending_free_lock, flags); 
+ was_empty = list_empty(&pending_free); 
+ list_add(&req->free_list, &pending_free); 
+ spin_unlock_irqrestore(&pending_free_lock, flags); 
+ if (was_empty) 
+ wake_up(&pending_free_wq); 
+} 
+ 
+static void unplug_queue(blkif_t *blkif) 
+{ 
+ if (NULL == blkif->plug) 
+ return; 
+ if (blkif->plug->unplug_fn) 
+ blkif->plug->unplug_fn(blkif->plug); 
+ blk_put_queue(blkif->plug); 
+ blkif->plug = NULL; 
+} 
+ 
+static void plug_queue(blkif_t *blkif, struct bio *bio) 
+{ 
+ request_queue_t *q = bdev_get_queue(bio->bi_bdev); 
+ 
+ if (q == blkif->plug) 
+ return; 
+ unplug_queue(blkif); 
+ blk_get_queue(q); 
+ blkif->plug = q; 
+} 
+ 
+static void fast_flush_area(pending_req_t *req) 
 { 
         struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
         unsigned int i, invcount = 0; 
         u16 handle; 
         int ret; 
  
- for (i = 0; i < nr_pages; i++) { 
- handle = pending_handle(idx, i); 
+ for (i = 0; i < req->nr_pages; i++) { 
+ handle = pending_handle(req, i); 
                 if (handle == BLKBACK_INVALID_HANDLE) 
                         continue; 
- unmap[invcount].host_addr    = MMAP_VADDR(idx, i); 
+ unmap[invcount].host_addr    = vaddr(req, i); 
                 unmap[invcount].dev_bus_addr = 0; 
                 unmap[invcount].handle       = handle; 
- pending_handle(idx, i) = BLKBACK_INVALID_HANDLE; 
+ pending_handle(req, i) = BLKBACK_INVALID_HANDLE; 
                 invcount++; 
         } 
  
@@ -133,109 +178,56 @@ 
         BUG_ON(ret); 
 } 
  
- 
-/****************************************************************** 
- * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE 
- */ 
- 
-static struct list_head blkio_schedule_list; 
-static spinlock_t blkio_schedule_list_lock; 
- 
-static int __on_blkdev_list(blkif_t *blkif) 
-{ 
- return blkif->blkdev_list.next != NULL; 
-} 
- 
-static void remove_from_blkdev_list(blkif_t *blkif) 
-{ 
- unsigned long flags; 
- 
- if (!__on_blkdev_list(blkif)) 
- return; 
- 
- spin_lock_irqsave(&blkio_schedule_list_lock, flags); 
- if (__on_blkdev_list(blkif)) { 
- list_del(&blkif->blkdev_list); 
- blkif->blkdev_list.next = NULL; 
- blkif_put(blkif); 
- } 
- spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); 
-} 
- 
-static void add_to_blkdev_list_tail(blkif_t *blkif) 
-{ 
- unsigned long flags; 
- 
- if (__on_blkdev_list(blkif)) 
- return; 
- 
- spin_lock_irqsave(&blkio_schedule_list_lock, flags); 
- if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) { 
- list_add_tail(&blkif->blkdev_list, &blkio_schedule_list); 
- blkif_get(blkif); 
- } 
- spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); 
-} 
- 
- 
 /****************************************************************** 
  * SCHEDULER FUNCTIONS 
  */ 
  
-static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait); 
- 
-static int blkio_schedule(void *arg) 
-{ 
- DECLARE_WAITQUEUE(wq, current); 
- 
- blkif_t          *blkif; 
- struct list_head *ent; 
- 
- daemonize("xenblkd"); 
- 
+int blkif_schedule(void *arg) 
+{ 
+ blkif_t          *blkif = arg; 
+ 
+ blkif_get(blkif); 
+ printk(KERN_DEBUG "%s: started\n", current->comm); 
         for (;;) { 
- /* Wait for work to do. */ 
- add_wait_queue(&blkio_schedule_wait, &wq); 
- set_current_state(TASK_INTERRUPTIBLE); 
- if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || 
-     list_empty(&blkio_schedule_list) ) 
- schedule(); 
- __set_current_state(TASK_RUNNING); 
- remove_wait_queue(&blkio_schedule_wait, &wq); 
- 
- /* Queue up a batch of requests. */ 
- while ((NR_PENDING_REQS < MAX_PENDING_REQS) && 
-       !list_empty(&blkio_schedule_list)) { 
- ent = blkio_schedule_list.next; 
- blkif = list_entry(ent, blkif_t, blkdev_list); 
- blkif_get(blkif); 
- remove_from_blkdev_list(blkif); 
- if (do_block_io_op(blkif, BATCH_PER_DOMAIN)) 
- add_to_blkdev_list_tail(blkif); 
- blkif_put(blkif); 
- } 
- 
- /* Push the batch through to disc. */ 
- flush_plugged_queue(); 
- } 
-} 
- 
-static void maybe_trigger_blkio_schedule(void) 
-{ 
- /* 
- * Needed so that two processes, which together make the following 
- * predicate true, don't both read stale values and evaluate the 
- * predicate incorrectly. Incredibly unlikely to stall the scheduler 
- * on x86, but... 
- */ 
- smp_mb(); 
- 
- if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && 
-    !list_empty(&blkio_schedule_list)) 
- wake_up(&blkio_schedule_wait); 
-} 
- 
- 
+ if (!atomic_read(&blkif->io_pending)) { 
+ /* Wait for work to do or requests to exit. */ 
+ if (kthread_should_stop()) 
+ break; 
+ wait_event_interruptible(blkif->wq, 
+ atomic_read(&blkif->io_pending) || 
+ kthread_should_stop()); 
+ } else if (list_empty(&pending_free)) { 
+ /* Wait for pending_req becoming available. */ 
+ wait_event_interruptible(pending_free_wq, 
+ !list_empty(&pending_free)); 
+ } 
+ 
+ /* Schedule I/O */ 
+ atomic_set(&blkif->io_pending, 0); 
+ if (do_block_io_op(blkif)) 
+ atomic_inc(&blkif->io_pending); 
+ unplug_queue(blkif); 
+ 
+#if 0 
+ /* Print stats for performance debugging. */ 
+ if (time_after(jiffies, blkif->st_print)) { 
+ printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n", 
+       current->comm, blkif->st_oo_req, 
+       blkif->st_rd_req, blkif->st_wr_req); 
+ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); 
+ blkif->st_rd_req = 0; 
+ blkif->st_wr_req = 0; 
+ blkif->st_oo_req = 0; 
+ } 
+#endif 
+ } 
+ 
+ /* bye folks, and thanks for all the fish ;) */ 
+ printk(KERN_DEBUG "%s: exiting\n", current->comm); 
+ blkif->xenblkd = NULL; 
+ blkif_put(blkif); 
+ return 0; 
+} 
  
 /****************************************************************** 
  * COMPLETION CALLBACK -- Called as bh->b_end_io() 
@@ -243,8 +235,6 @@ 
  
 static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
 { 
- unsigned long flags; 
- 
         /* An error fails the entire request. */ 
         if (!uptodate) { 
                 DPRINTK("Buffer not up-to-date at end of operation\n");

@@ -252,15 +242,11 @@ 
         } 
  
         if (atomic_dec_and_test(&pending_req->pendcnt)) { 
- int pending_idx = pending_req - pending_reqs; 
- fast_flush_area(pending_idx, pending_req->nr_pages); 
+ fast_flush_area(pending_req); 
                 make_response(pending_req->blkif, pending_req->id, 
                               pending_req->operation, pending_req->status);
                 blkif_put(pending_req->blkif); 
- spin_lock_irqsave(&pend_prod_lock, flags); 
- pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; 
- spin_unlock_irqrestore(&pend_prod_lock, flags); 
- maybe_trigger_blkio_schedule(); 
+ free_req(pending_req); 
         } 
 } 
  
@@ -281,8 +267,10 @@ 
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) 
 { 
         blkif_t *blkif = dev_id; 
- add_to_blkdev_list_tail(blkif); 
- maybe_trigger_blkio_schedule(); 
+ 
+ atomic_inc(&blkif->io_pending); 
+ if (blkif->status == CONNECTED) 
+ wake_up(&blkif->wq); 
         return IRQ_HANDLED; 
 } 
  
@@ -292,10 +280,11 @@ 
  * DOWNWARD CALLS -- These interface with the block-device layer proper.
  */ 
  
-static int do_block_io_op(blkif_t *blkif, int max_to_do) 
+static int do_block_io_op(blkif_t *blkif) 
 { 
         blkif_back_ring_t *blk_ring = &blkif->blk_ring; 
         blkif_request_t *req; 
+ pending_req_t *pending_req; 
         RING_IDX i, rp; 
         int more_to_do = 0; 
  
@@ -305,24 +294,30 @@ 
         for (i = blk_ring->req_cons; 
              (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i); 
              i++) { 
- if ((max_to_do-- == 0) || 
-    (NR_PENDING_REQS == MAX_PENDING_REQS)) { 
+ 
+ pending_req = alloc_req(); 
+ if (NULL == pending_req) { 
+ blkif->st_oo_req++; 
                         more_to_do = 1; 
                         break; 
                 } 
-         
+ 
                 req = RING_GET_REQUEST(blk_ring, i); 
                 switch (req->operation) { 
                 case BLKIF_OP_READ: 
+ blkif->st_rd_req++; 
+ dispatch_rw_block_io(blkif, req, pending_req); 
+ break; 
                 case BLKIF_OP_WRITE: 
- dispatch_rw_block_io(blkif, req); 
+ blkif->st_wr_req++; 
+ dispatch_rw_block_io(blkif, req, pending_req); 
                         break; 
- 
                 default: 
                         DPRINTK("error: unknown block io operation
[%d]\n", 
                                 req->operation); 
                         make_response(blkif, req->id, req->operation, 
                                       BLKIF_RSP_ERROR); 
+ free_req(pending_req); 
                         break; 
                 } 
         } 
@@ -331,13 +326,13 @@ 
         return more_to_do; 
 } 
  
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) 
+static void dispatch_rw_block_io(blkif_t *blkif, 
+ blkif_request_t *req, 
+ pending_req_t *pending_req) 
 { 
         extern void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]);
         int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
         unsigned long fas = 0; 
- int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; 
- pending_req_t *pending_req; 
         struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
         struct phys_req preq; 
         struct { 
@@ -345,31 +340,35 @@ 
         } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 
         unsigned int nseg; 
         struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- int nbio = 0; 
- request_queue_t *q; 
- int ret, errors = 0; 
+ int ret, i, nbio = 0; 
  
         /* Check that number of segments is sane. */ 
         nseg = req->nr_segments; 
         if (unlikely(nseg == 0) || 
             unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { 
                 DPRINTK("Bad number of segments in request (%d)\n",
nseg); 
- goto bad_descriptor; 
+ goto fail_response; 
         } 
  
         preq.dev           = req->handle; 
         preq.sector_number = req->sector_number; 
         preq.nr_sects      = 0; 
  
+ pending_req->blkif     = blkif; 
+ pending_req->id        = req->id; 
+ pending_req->operation = operation; 
+ pending_req->status    = BLKIF_RSP_OKAY; 
+ pending_req->nr_pages  = nseg; 
+ 
         for (i = 0; i < nseg; i++) { 
                 fas         = req->frame_and_sects[i]; 
                 seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
  
                 if (seg[i].nsec <= 0) 
- goto bad_descriptor; 
+ goto fail_response; 
                 preq.nr_sects += seg[i].nsec; 
  
- map[i].host_addr = MMAP_VADDR(pending_idx, i); 
+ map[i].host_addr = vaddr(pending_req, i); 
                 map[i].dom = blkif->domid; 
                 map[i].ref = blkif_gref_from_fas(fas); 
                 map[i].flags = GNTMAP_host_map; 
@@ -381,27 +380,23 @@ 
         BUG_ON(ret); 
  
         for (i = 0; i < nseg; i++) { 
- if (likely(map[i].handle >= 0)) { 
- pending_handle(pending_idx, i) = map[i].handle; 
+ if (unlikely(map[i].handle < 0)) { 
+ DPRINTK("invalid buffer -- could not remap it\n"); 
+ goto fail_flush; 
+ } 
+ 
+ pending_handle(pending_req, i) = map[i].handle; 
 #ifdef __ia64__ 
- MMAP_VADDR(pending_idx,i) = gnttab_map_vaddr(map[i]); 
+ pending_vaddrs[vaddr_pagenr(pending_req, i)] = 
+ gnttab_map_vaddr(map[i]); 
 #else 
- phys_to_machine_mapping[__pa(MMAP_VADDR( 
- pending_idx, i)) >> PAGE_SHIFT] = 
- FOREIGN_FRAME(map[i].dev_bus_addr>>PAGE_SHIFT); 
+ phys_to_machine_mapping[__pa(vaddr( 
+ pending_req, i)) >> PAGE_SHIFT] = 
+ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT); 
 #endif 
- fas        = req->frame_and_sects[i]; 
- seg[i].buf = map[i].dev_bus_addr | 
- (blkif_first_sect(fas) << 9); 
- } else { 
- errors++; 
- } 
- } 
- 
- if (errors) { 
- DPRINTK("invalid buffer -- could not remap it\n"); 
- fast_flush_area(pending_idx, nseg); 
- goto bad_descriptor; 
+ fas         = req->frame_and_sects[i]; 
+ seg[i].buf  = map[i].dev_bus_addr | 
+ (blkif_first_sect(fas) << 9); 
         } 
  
         if (vbd_translate(&preq, blkif, operation) != 0) { 
@@ -409,37 +404,25 @@ 
                         operation == READ ? "read" : "write", 
                         preq.sector_number, 
                         preq.sector_number + preq.nr_sects, preq.dev); 
- goto bad_descriptor; 
- } 
- 
- pending_req = &pending_reqs[pending_idx]; 
- pending_req->blkif     = blkif; 
- pending_req->id        = req->id; 
- pending_req->operation = operation; 
- pending_req->status    = BLKIF_RSP_OKAY; 
- pending_req->nr_pages  = nseg; 
+ goto fail_flush; 
+ } 
  
         for (i = 0; i < nseg; i++) { 
                 if (((int)preq.sector_number|(int)seg[i].nsec) & 
                     ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) { 
                         DPRINTK("Misaligned I/O request from domain
%d", 
                                 blkif->domid); 
- goto cleanup_and_fail; 
+ goto fail_put_bio; 
                 } 
  
                 while ((bio == NULL) || 
                        (bio_add_page(bio, 
-     virt_to_page(MMAP_VADDR(pending_idx, i)), 
+     virt_to_page(vaddr(pending_req, i)), 
                                      seg[i].nsec << 9, 
                                      seg[i].buf & ~PAGE_MASK) == 0)) { 
                         bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
- if (unlikely(bio == NULL)) { 
- cleanup_and_fail: 
- for (i = 0; i < (nbio-1); i++) 
- bio_put(biolist[i]); 
- fast_flush_area(pending_idx, nseg); 
- goto bad_descriptor; 
- } 
+ if (unlikely(bio == NULL)) 
+ goto fail_put_bio; 
                  
                         bio->bi_bdev    = preq.bdev; 
                         bio->bi_private = pending_req; 
@@ -450,14 +433,8 @@ 
                 preq.sector_number += seg[i].nsec; 
         } 
  
- if ((q = bdev_get_queue(bio->bi_bdev)) != plugged_queue) { 
- flush_plugged_queue(); 
- blk_get_queue(q); 
- plugged_queue = q; 
- } 
- 
+ plug_queue(blkif, bio); 
         atomic_set(&pending_req->pendcnt, nbio); 
- pending_cons++; 
         blkif_get(blkif); 
  
         for (i = 0; i < nbio; i++) 
@@ -465,8 +442,14 @@ 
  
         return; 
  
- bad_descriptor: 
+ fail_put_bio: 
+ for (i = 0; i < (nbio-1); i++) 
+ bio_put(biolist[i]); 
+ fail_flush: 
+ fast_flush_area(pending_req); 
+ fail_response: 
         make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+ free_req(pending_req); 
 } 
  
  
@@ -498,56 +481,47 @@ 
         notify_remote_via_irq(blkif->irq); 
 } 
  
-void blkif_deschedule(blkif_t *blkif) 
-{ 
- remove_from_blkdev_list(blkif); 
-} 
- 
 static int __init blkif_init(void) 
 { 
+ struct page *page; 
         int i; 
- struct page *page; 
- int ret; 
- 
- for (i = 0; i < MMAP_PAGES; i++) 
- pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; 
- 
- if (xen_init() < 0) 
- return -ENODEV; 
+ 
+ mmap_pages            = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; 
+ pending_reqs          = kmalloc(sizeof(pending_reqs[0]) * 
+ blkif_reqs, GFP_KERNEL); 
+ pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) * 
+ mmap_pages, GFP_KERNEL); 
+ pending_vaddrs        = kmalloc(sizeof(pending_vaddrs[0]) * 
+ mmap_pages, GFP_KERNEL); 
+ if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) { 
+ printk("%s: out of memory\n", __FUNCTION__); 
+ return -1; 
+ } 
  
         blkif_interface_init(); 
- 
+ 
 #ifdef __ia64__ 
-    { 
         extern unsigned long alloc_empty_foreign_map_page_range(unsigned long pages);
- int i; 
- 
- mmap_vstart =  alloc_empty_foreign_map_page_range(MMAP_PAGES); 
- printk("Allocated mmap_vstart: 0x%lx\n", mmap_vstart); 
- for(i = 0; i < MMAP_PAGES; i++) 
-    pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT); 
- BUG_ON(mmap_vstart == NULL); 
-    } 
-#else 
- page = balloon_alloc_empty_page_range(MMAP_PAGES); 
+ mmap_vstart = (unsigned long)alloc_empty_foreign_map_page_range(mmap_pages);
+#else /* ! ia64 */ 
+ page = balloon_alloc_empty_page_range(mmap_pages); 
         BUG_ON(page == NULL); 
         mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); 
 #endif 
- 
- pending_cons = 0; 
- pending_prod = MAX_PENDING_REQS; 
+ printk("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n", 
+       __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart); 
+ BUG_ON(mmap_vstart == 0); 
+ for (i = 0; i < mmap_pages; i++) 
+ pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT); 
+ 
+ memset(pending_grant_handles,  BLKBACK_INVALID_HANDLE, mmap_pages); 
         memset(pending_reqs, 0, sizeof(pending_reqs)); 
- for (i = 0; i < MAX_PENDING_REQS; i++) 
- pending_ring[i] = i; 
+ INIT_LIST_HEAD(&pending_free); 
+ 
+ for (i = 0; i < blkif_reqs; i++) 
+ list_add_tail(&pending_reqs[i].free_list, &pending_free); 
      
- spin_lock_init(&blkio_schedule_list_lock); 
- INIT_LIST_HEAD(&blkio_schedule_list); 
- 
- ret = kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES); 
- BUG_ON(ret < 0); 
- 
         blkif_xenbus_init(); 
- 
         return 0; 
 } 
  
diff -r abbe3df33774 linux-2.6-xen-sparse/drivers/xen/blkback/common.h 
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Tue Nov  8 17:39:58 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Wed Nov  9 13:45:37 2005
@@ -56,9 +56,19 @@ 
         /* Is this a blktap frontend */ 
         unsigned int     is_blktap; 
 #endif 
- struct list_head blkdev_list; 
         spinlock_t       blk_ring_lock; 
         atomic_t         refcnt; 
+ 
+ wait_queue_head_t   wq; 
+ struct task_struct  *xenblkd; 
+ atomic_t            io_pending; 
+ request_queue_t     *plug; 
+ 
+ /* statistics */ 
+ unsigned long       st_print; 
+ int                 st_rd_req; 
+ int                 st_wr_req; 
+ int                 st_oo_req; 
  
         struct work_struct free_work; 
  
@@ -97,11 +107,10 @@ 
  
 void blkif_interface_init(void); 
  
-void blkif_deschedule(blkif_t *blkif); 
- 
 void blkif_xenbus_init(void); 
  
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); 
+int blkif_schedule(void *arg); 
  
 #endif /* __BLKIF__BACKEND__COMMON_H__ */ 
  
diff -r abbe3df33774 linux-2.6-xen-sparse/drivers/xen/blkback/interface.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c Tue Nov  8 17:39:58 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c Wed Nov  9 13:45:37 2005
@@ -24,6 +24,8 @@ 
         blkif->status = DISCONNECTED; 
         spin_lock_init(&blkif->blk_ring_lock); 
         atomic_set(&blkif->refcnt, 1); 
+ init_waitqueue_head(&blkif->wq); 
+ blkif->st_print = jiffies; 
  
         return blkif; 
 } 
diff -r abbe3df33774 linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c 
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Tue Nov  8 17:39:58 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Wed Nov  9 13:45:37 2005
@@ -17,6 +17,7 @@ 
 */ 
 #include <stdarg.h> 
 #include <linux/module.h> 
+#include <linux/kthread.h> 
 #include <asm-xen/xenbus.h> 
 #include "common.h" 
  
@@ -46,8 +47,11 @@ 
         if (be->watch.node) 
                 unregister_xenbus_watch(&be->watch); 
         unregister_xenbus_watch(&be->backend_watch); 
- if (be->blkif) 
+ if (be->blkif) { 
+ if (be->blkif->xenblkd) 
+ kthread_stop(be->blkif->xenblkd); 
                 blkif_put(be->blkif); 
+ } 
         if (be->frontpath) 
                 kfree(be->frontpath); 
         kfree(be); 
@@ -198,6 +202,16 @@ 
                         be->blkif = NULL; 
                         xenbus_dev_error(dev, err, 
                                          "creating vbd structure"); 
+ return; 
+ } 
+ 
+ be->blkif->xenblkd = kthread_run(blkif_schedule, be->blkif, 
+ "xenblkd %d/%04lx", 
+ be->blkif->domid, be->pdev); 
+ if (IS_ERR(be->blkif->xenblkd)) { 
+ err = PTR_ERR(be->blkif->xenblkd); 
+ be->blkif->xenblkd = NULL; 
+ xenbus_dev_error(dev, err, "start xenblkd"); 
                         return; 
                 } 
  
_______________________________________________ 
Xen-devel mailing list 
Xen-devel@... 
http://lists.xensource.com/xen-devel 

_______________________________________________
Xen-ia64-devel mailing list
Xen-ia64-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-ia64-devel
