commit ff1902ee8462cf7d763a8fd42255a72f8e31bebe
Author: Konrad Rzeszutek Wilk
Date:   Mon Apr 11 22:37:52 2011 -0400

    blkback: Patch to fix unmapping pages while iSCSI still has references to them.

    Signed-off-by: Gary Grebus
    Signed-off-by: Joshua Nicholas
    [v1: Port from dom0-sources.4.5.16.11.tar.gz]
    [v2: Added a whole bunch of printks]
    [v3: Fixed merge error]
    Signed-off-by: Konrad Rzeszutek Wilk

diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c
index 0bef445..ab8d12d 100644
--- a/drivers/xen/blkback/blkback.c
+++ b/drivers/xen/blkback/blkback.c
@@ -81,7 +81,11 @@ typedef struct {
 	atomic_t pendcnt;
 	unsigned short operation;
 	int status;
-	struct list_head free_list;
+	unsigned long delay_expire_time;
+	union {
+		struct list_head free_list;
+		struct list_head delayed_list;
+	};
 } pending_req_t;
 
 static pending_req_t *pending_reqs;
@@ -89,6 +93,10 @@ static struct list_head pending_free;
 static DEFINE_SPINLOCK(pending_free_lock);
 static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
 
+static struct list_head delayed_reqs;
+static DEFINE_SPINLOCK(delayed_reqs_lock);
+static struct timer_list delayed_reqs_timer;
+
 #define BLKBACK_INVALID_HANDLE (~0)
 
 static struct page **pending_pages;
@@ -132,6 +140,8 @@ static pending_req_t* alloc_req(void)
 		list_del(&req->free_list);
 	}
 	spin_unlock_irqrestore(&pending_free_lock, flags);
+	if (req != NULL)
+		req->delay_expire_time = 0;
 	return req;
 }
 
@@ -169,29 +179,160 @@ static void plug_queue(blkif_t *blkif, struct block_device *bdev)
 	blkif->plug = q;
 }
 
-static void fast_flush_area(pending_req_t *req)
+static void delay_completion(pending_req_t *req)
+{
+	unsigned long flags;
+	pending_req_t *oldest;
+	unsigned long oldest_time;
+
+	req->delay_expire_time = jiffies + (HZ / 64);
+	spin_lock_irqsave(&delayed_reqs_lock, flags);
+	list_add_tail(&req->delayed_list, &delayed_reqs);
+
+	oldest = list_entry(delayed_reqs.next, pending_req_t, delayed_list);
+	oldest_time = oldest->delay_expire_time;
+	spin_unlock_irqrestore(&delayed_reqs_lock, flags);
+
+	mod_timer(&delayed_reqs_timer, oldest_time);
+	printk(KERN_INFO "%s: domid: %d (irq:%u) id: %llx added on delayed list\n",
+	       __func__, (unsigned int)req->blkif->domid,
+	       req->blkif->irq,
+	       req->id);
+}
+
+static int fast_flush_area(pending_req_t *req)
 {
 	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int i, invcount = 0;
 	grant_handle_t handle;
 	int ret;
+	struct page *pending_page;
+	struct page *new_page;
+	int pending_page_count;
+	int pending_page_mapcount;
+
+	unsigned int pgidx;
+	unsigned int busy_pages = 0;
+	int delay_expired;
+
+	/* Already waited for extra refs to clear on one or more pages? */
+	delay_expired = req->delay_expire_time &&
+			time_before(req->delay_expire_time, jiffies);
+
 	for (i = 0; i < req->nr_pages; i++) {
 		handle = pending_handle(req, i);
 		if (handle == BLKBACK_INVALID_HANDLE)
 			continue;
-		blkback_pagemap_clear(pending_page(req, i));
-		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
-				    GNTMAP_host_map, handle);
-		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
-		invcount++;
+
+		pgidx = vaddr_pagenr(req, i);
+		pending_page = pending_pages[pgidx];
+		pending_page_count = page_count(pending_page);
+		pending_page_mapcount = page_mapcount(pending_page);
+
+		if (pending_page_count <= 1) {
+			if (unlikely(pending_page_mapcount > 0)) { /* used as a flag by the netback lazy copying scheme, */
+				reset_page_mapcount(pending_page); /* but it blocks gnttab_copy_grant_page() if left set */
+			}
+			blkback_pagemap_clear(pending_page(req, i));
+			gnttab_set_unmap_op(&unmap[invcount++], vaddr(req, i),
+					    GNTMAP_host_map, handle);
+			pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+		} else {
+			printk(KERN_INFO "domid: %d (irq:%u) id: %llx count: %d, mapcount: %d, delay %s, busy_pages: %u\n",
+			       (unsigned int)req->blkif->domid,
+			       req->blkif->irq,
+			       req->id,
+			       pending_page_count,
+			       pending_page_mapcount,
+			       delay_expired ? "expired" : "still ticking",
+			       busy_pages);
+			if (!delay_expired) {
+				busy_pages++;
+			} else {
+				if (unlikely(pending_page_mapcount > 0)) { /* same netback lazy-copy flag as above, */
+					reset_page_mapcount(pending_page); /* which blocks gnttab_copy_grant_page() if left set */
+					busy_pages++;
+				} else {
+					printk(KERN_INFO "domid: %d (irq:%u) id: %llx leaking page %d\n",
+					       (unsigned int)req->blkif->domid,
+					       req->blkif->irq, req->id,
+					       pgidx);
+					new_page = pending_page;
+					ret = gnttab_copy_grant_page(handle, &new_page); /* a new dom0 page is returned in new_page */
+					if (!ret) {
+						pending_pages[pgidx] = new_page;
+						pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+					} else {
+						busy_pages++;
+					}
+				}
+			}
+		}
 	}
 
-	ret = HYPERVISOR_grant_table_op(
-		GNTTABOP_unmap_grant_ref, unmap, invcount);
-	BUG_ON(ret);
+	if (invcount > 0) {
+		ret = HYPERVISOR_grant_table_op(
+			GNTTABOP_unmap_grant_ref, unmap, invcount);
+		BUG_ON(ret);
+	}
+	return busy_pages;
 }
+
+/*
+ * Completion of a request is delayed if one or more of the mapped guest
+ * pages still has a reference.  This can happen if the block I/O gets
+ * turned into a network I/O, for example with iSCSI.
+ */ +static void complete_delayed_reqs(unsigned long unused) +{ + pending_req_t *req; + pending_req_t *tmp; + unsigned long oldest_time = 0; + unsigned long flags; + unsigned long count = 0; + LIST_HEAD(work_list); + + if (list_empty(&delayed_reqs)) + return; + + /* Grab the whole list */ + spin_lock_irqsave(&delayed_reqs_lock, flags); + list_for_each_entry_safe(req, tmp, &delayed_reqs, delayed_list) { + count++; + list_move_tail(&req->delayed_list, &work_list); + } + spin_unlock_irqrestore(&delayed_reqs_lock, flags); + printk(KERN_INFO "%s: processing %ld requests.\n", __func__, count); + list_for_each_entry_safe(req, tmp, &work_list, delayed_list) { + printk(KERN_INFO "%s: domid: %d (irq:%u) id: %llx\n", __func__, + (unsigned int)req->blkif->domid, + req->blkif->irq, req->id); + if (unlikely(fast_flush_area(req))) { + spin_lock_irqsave(&delayed_reqs_lock, flags); + list_move_tail(&req->delayed_list, &delayed_reqs); + spin_unlock_irqrestore(&delayed_reqs_lock, flags); + } else { + list_del(&req->delayed_list); + make_response(req->blkif, req->id, + req->operation, req->status); + + blkif_put(req->blkif); + free_req(req); + } + } + + spin_lock_irqsave(&delayed_reqs_lock, flags); + if (! list_empty(&delayed_reqs)) { + tmp = list_entry(delayed_reqs.next, pending_req_t, + delayed_list); + oldest_time = tmp->delay_expire_time; + } + spin_unlock_irqrestore(&delayed_reqs_lock, flags); + if (oldest_time) + mod_timer(&delayed_reqs_timer, oldest_time); +} /****************************************************************** * SCHEDULER FUNCTIONS */ @@ -258,6 +399,10 @@ int blkif_schedule(void *arg) static void __end_block_io_op(pending_req_t *pending_req, int error) { + + + complete_delayed_reqs(0); /* Previously delayed reqs are usually done.*/ + /* An error fails the entire request. */ if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && (error == -EOPNOTSUPP)) { @@ -271,9 +416,15 @@ static void __end_block_io_op(pending_req_t *pending_req, int error) } if (atomic_dec_and_test(&pending_req->pendcnt)) { - fast_flush_area(pending_req); + if (unlikely(fast_flush_area(pending_req))) { + /* Granted page(s) are still being referenced */ + delay_completion(pending_req); + return; + } + make_response(pending_req->blkif, pending_req->id, pending_req->operation, pending_req->status); + blkif_put(pending_req->blkif); free_req(pending_req); } @@ -316,6 +467,7 @@ static int do_block_io_op(blkif_t *blkif) RING_IDX rc, rp; int more_to_do = 0; + rc = blk_rings->common.req_cons; rp = blk_rings->common.sring->req_prod; rmb(); /* Ensure we see queued requests up to 'rp'. 
@@ -337,6 +489,7 @@ static int do_block_io_op(blkif_t *blkif)
 			break;
 		}
 
+
 		switch (blkif->blk_protocol) {
 		case BLKIF_PROTOCOL_NATIVE:
 			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
@@ -542,7 +695,13 @@ static void dispatch_rw_block_io(blkif_t *blkif,
 		return;
 
  fail_flush:
-	fast_flush_area(pending_req);
+	if (unlikely(fast_flush_area(pending_req))) {
+		/* Granted page(s) are being referenced by a previous I/O */
+		pending_req->status = BLKIF_RSP_ERROR;
+		delay_completion(pending_req);
+		msleep(1); /* back off a bit */
+		return;
+	}
 fail_response:
 	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
 	free_req(pending_req);
@@ -652,6 +811,12 @@ static int __init blkif_init(void)
 	memset(pending_reqs, 0, sizeof(pending_reqs));
 	INIT_LIST_HEAD(&pending_free);
 
+	INIT_LIST_HEAD(&delayed_reqs);
+	spin_lock_init(&delayed_reqs_lock);
+	init_timer(&delayed_reqs_timer);
+	delayed_reqs_timer.data = 0;
+	delayed_reqs_timer.function = complete_delayed_reqs;
+
 	for (i = 0; i < blkif_reqs; i++)
 		list_add_tail(&pending_reqs[i].free_list, &pending_free);
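
Note: the deferral pattern this patch introduces (a per-request deadline, a single timer armed for the oldest deadline, and a retry pass that either completes a request or parks it again) can be sketched in isolation. The fragment below is an illustrative sketch only and is not part of the commit; the names deferred_item, defer(), try_finish() and deferred_tick() are hypothetical, and it assumes the pre-2.6.39 timer API (init_timer plus the .data/.function fields) that the code above uses.

#include <linux/jiffies.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/timer.h>

/* Hypothetical deferred work item: one per request that cannot complete yet. */
struct deferred_item {
	unsigned long expire;			/* deadline for waiting politely */
	struct list_head link;
	int (*try_finish)(struct deferred_item *);	/* 0 = done, non-zero = still busy */
};

static LIST_HEAD(deferred_list);
static DEFINE_SPINLOCK(deferred_lock);
static struct timer_list deferred_timer;

/* Park an item and (re)arm the single timer for the oldest deadline. */
static void defer(struct deferred_item *it)
{
	unsigned long flags, oldest;

	it->expire = jiffies + HZ / 64;		/* same grace period as the patch */
	spin_lock_irqsave(&deferred_lock, flags);
	list_add_tail(&it->link, &deferred_list);
	oldest = list_first_entry(&deferred_list,
				  struct deferred_item, link)->expire;
	spin_unlock_irqrestore(&deferred_lock, flags);

	mod_timer(&deferred_timer, oldest);
}

/* Timer callback: retry every parked item, completing the ones whose
 * references have drained and re-queueing the rest. */
static void deferred_tick(unsigned long unused)
{
	struct deferred_item *it, *tmp;
	unsigned long flags, oldest = 0;
	LIST_HEAD(work);

	spin_lock_irqsave(&deferred_lock, flags);
	list_splice_init(&deferred_list, &work);
	spin_unlock_irqrestore(&deferred_lock, flags);

	list_for_each_entry_safe(it, tmp, &work, link) {
		list_del(&it->link);
		if (it->try_finish(it)) {	/* still busy: park it again */
			spin_lock_irqsave(&deferred_lock, flags);
			list_add_tail(&it->link, &deferred_list);
			spin_unlock_irqrestore(&deferred_lock, flags);
		}
	}

	spin_lock_irqsave(&deferred_lock, flags);
	if (!list_empty(&deferred_list))
		oldest = list_first_entry(&deferred_list,
					  struct deferred_item, link)->expire;
	spin_unlock_irqrestore(&deferred_lock, flags);
	if (oldest)
		mod_timer(&deferred_timer, oldest);
}

static void deferred_init(void)
{
	init_timer(&deferred_timer);
	deferred_timer.data = 0;
	deferred_timer.function = deferred_tick;
}

Keeping one timer armed for the oldest deadline, rather than one timer per request, is what lets delay_completion() stay cheap on the common path where the extra page references drain before the timer ever fires.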