Signed-off-by: Kaushik Kumar Ram Signed-off-by: Jose Renato Santos # HG changeset patch # User Jose Renato Santos policy) { +#ifdef CONFIG_XEN_NETDEV2_VMQ + case transmit_policy_vmq: + prepare_xmit_vmq(nc, tp); + break; +#endif case transmit_policy_small: /* Nothing to do */ break; @@ -1728,6 +1738,9 @@ static void process_ring(struct netchann /* Pick up incoming messages. */ nc2_poll(ncrp); +#ifdef CONFIG_XEN_NETDEV2_VMQ + do_vmq_work(nc); +#endif /* Transmit pending packets. */ if (!list_empty(&ncrp->pending_tx_packets)) { list_for_each_entry_safe(tp, next_tp, diff -r 9f69a5d6b95e -r 9923186b736e drivers/xen/netchannel2/netback2.c --- a/drivers/xen/netchannel2/netback2.c Fri Dec 19 00:12:02 2008 -0800 +++ b/drivers/xen/netchannel2/netback2.c Fri Dec 19 18:46:00 2008 -0800 @@ -13,6 +13,13 @@ static struct netchannel2 *device_to_nc2 static struct netchannel2 *device_to_nc2(struct device *dev); #include "sysfs.c" + +#ifdef CONFIG_XEN_NETDEV2_VMQ +#include "vmq.h" +#define NR_TX_BUFS (VMQ_MAX_BUFFERS+256) +#else +#define NR_TX_BUFS 256 +#endif static atomic_t next_handle; /* A list of all currently-live netback2 interfaces. */ @@ -174,10 +181,22 @@ static int attach_to_frontend(struct net return err; } +#ifdef CONFIG_XEN_NETDEV2_VMQ + nc2_vmq_connect(nc); +#endif + /* All done */ nd->attached = 1; return 0; +} + +static void nb2_shutdown(struct netchannel2 *nc) +{ +#ifdef CONFIG_XEN_NETDEV2_VMQ + nc2_vmq_disconnect(nc); +#endif + nc2_set_nr_tx_buffers(nc, 0); } static void frontend_changed(struct xenbus_device *xd, @@ -197,7 +216,7 @@ static void frontend_changed(struct xenb * detached, and this is pointless but harmless.) */ detach_from_frontend(nb); - nc2_set_nr_tx_buffers(nb->chan, 256); + nc2_set_nr_tx_buffers(nb->chan, NR_TX_BUFS); /* Tell the frontend what sort of rings we're willing to accept.
*/ @@ -224,7 +243,7 @@ static void frontend_changed(struct xenb break; case XenbusStateClosing: - nc2_set_nr_tx_buffers(nb->chan, 0); + nb2_shutdown(nb->chan); detach_from_frontend(nb); xenbus_switch_state(xd, XenbusStateClosed); break; @@ -268,7 +287,7 @@ static void netback2_shutdown(struct xen static void netback2_shutdown(struct xenbus_device *xd) { struct netback2 *nb = xenbus_device_to_nb2(xd); - nc2_set_nr_tx_buffers(nb->chan, 0); + nb2_shutdown(nb->chan); xenbus_switch_state(xd, XenbusStateClosing); } diff -r 9f69a5d6b95e -r 9923186b736e drivers/xen/netchannel2/netchannel2_core.h --- a/drivers/xen/netchannel2/netchannel2_core.h Fri Dec 19 00:12:02 2008 -0800 +++ b/drivers/xen/netchannel2/netchannel2_core.h Fri Dec 19 18:46:00 2008 -0800 @@ -6,6 +6,8 @@ #include #include #include + +#include "vmq_def.h" #if 0 #define DEBUGMSG(x, ...) do { printk(KERN_NOTICE "%s:%s:%d " x "\n", __FILE__, __func__, __LINE__ , ## __VA_ARGS__ ); } while (0) @@ -48,7 +50,8 @@ enum transmit_policy { transmit_policy_post, transmit_policy_map, transmit_policy_small, - transmit_policy_last = transmit_policy_small + transmit_policy_vmq, + transmit_policy_last = transmit_policy_vmq }; /* Packets which we've sent but which haven't yet received a FINISHED @@ -126,6 +129,7 @@ struct nc2_tx_buffer { grant_ref_t gref; uint16_t off_in_page; uint16_t size; + grant_handle_t grant_handle; }; struct netchannel2_prod_ring { @@ -485,6 +489,11 @@ struct netchannel2 { struct net_device_stats stats; struct hypercall_batcher batcher; + +#ifdef CONFIG_XEN_NETDEV2_VMQ + /* vmq data for supporting multi-queue devices */ + nc2_vmq_t vmq; +#endif }; #define BYPASS_RING_PAGES 2 @@ -617,6 +626,39 @@ static inline void flush_hypercall_batch flush_prepared_grant_copies(hb, on_fail); } +static inline struct nc2_tx_buffer *_get_tx_buffer(struct netchannel2 *nc) +{ + struct nc2_tx_buffer *buffer; + struct list_head *entry = nc->avail_tx_buffers.next; + list_del(entry); + buffer = list_entry(entry, struct 
nc2_tx_buffer, list); + nc->nr_avail_tx_buffers--; + return buffer; +} + +/* recycle a posted buffer: return it to the list of available buffers */ +static inline void recycle_tx_buffer(struct netchannel2 *nc, + struct nc2_tx_buffer *buffer) +{ + list_add(&buffer->list, &nc->avail_tx_buffers); + nc->nr_avail_tx_buffers++; +} + +/* add a buffer to the pending list to be returned to the other end buffer */ +static inline void return_tx_buffer(struct netchannel2 *nc, + struct nc2_tx_buffer *buffer) +{ + list_add(&buffer->list, &nc->pending_tx_buffer_return); +} + +/* add a buffer slot to list of unused buffer slots after it has been + * returned to other end */ +static inline void free_tx_buffer(struct netchannel2 *nc, + struct nc2_tx_buffer *buffer) +{ + list_add(&buffer->list, &nc->unused_tx_buffer_slots); +} + struct sk_buff *handle_receiver_copy_packet(struct netchannel2 *nc, struct netchannel2_ring_pair *ncrp, struct netchannel2_msg_packet *msg, diff -r 9f69a5d6b95e -r 9923186b736e drivers/xen/netchannel2/posted_buffers.c --- a/drivers/xen/netchannel2/posted_buffers.c Fri Dec 19 00:12:02 2008 -0800 +++ b/drivers/xen/netchannel2/posted_buffers.c Fri Dec 19 18:46:00 2008 -0800 @@ -10,11 +10,11 @@ #include "netchannel2_endpoint.h" #include "netchannel2_core.h" +#ifdef CONFIG_XEN_NETDEV2_VMQ +#include "vmq.h" +#endif + #define POSTED_BUFFER_SIZE PAGE_SIZE - -/* No matter what the other end wants, we never post more than this - number of RX buffers to it. */ -#define MAX_POSTED_BUFFERS 256 /* A poison value to make certain buffer management errors more * obvious. 
*/ @@ -730,6 +730,7 @@ void nc2_return_pending_posted_buffers(s txb = list_entry(nc->pending_tx_buffer_return.next, struct nc2_tx_buffer, list); + free_tx_buffer(nc, txb); msg.id = txb->id; nc2_send_message(&nc->rings.prod_ring, NETCHANNEL2_MSG_RETURN_POSTED_BUFFER, diff -r 9f69a5d6b95e -r 9923186b736e drivers/xen/netchannel2/util.c --- a/drivers/xen/netchannel2/util.c Fri Dec 19 00:12:02 2008 -0800 +++ b/drivers/xen/netchannel2/util.c Fri Dec 19 18:46:00 2008 -0800 @@ -356,6 +356,17 @@ struct transmitted_packet *allocate_tx_p } } +static inline void nc2_free_skb(struct transmitted_packet *tp) +{ +#ifdef CONFIG_XEN_NETDEV2_VMQ + nc2_vmq_t *vmq = &tp->ring_pair->interface->vmq; + if (tp->policy == transmit_policy_vmq ) + skb_queue_tail(&vmq->dealloc_queue, tp->skb); + else +#endif + dev_kfree_skb(tp->skb); +} + void release_tx_packet(struct netchannel2_ring_pair *ncrp, struct transmitted_packet *tp) { @@ -368,7 +379,7 @@ void release_tx_packet(struct netchannel BUG_ON(!tp->in_use); sanity_check_transmitted_packet(tp); if (tp->skb != NULL) { - dev_kfree_skb(tp->skb); + nc2_free_skb(tp); tp->skb = NULL; } if (tp->msg.type == NC2_PACKET_TYPE_receiver_copy) { diff -r 9f69a5d6b95e -r 9923186b736e drivers/xen/netchannel2/vmq.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/netchannel2/vmq.c Fri Dec 19 18:46:00 2008 -0800 @@ -0,0 +1,791 @@ +/***************************************************************************** + * vmq.c + * + * Support multi-queue network devices. + * + * Copyright (c) 2008, Kaushik Kumar Ram, Rice University. + * Copyright (c) 2008, Jose Renato Santos, Hewlett-Packard Co. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + */ + +#include +#include +#include +#include +#include +#include "netchannel2_core.h" + +#include "vmq.h" + +/* state of device queue when operating in vmq mode */ +#define VMQ_QUEUE_DISABLED 0 +#define VMQ_QUEUE_STARTING 1 +#define VMQ_QUEUE_ENABLED 2 +#define VMQ_QUEUE_CLOSING 3 + +static inline unsigned long vmq_idx_to_pfn(nc2_vmq_t *vmq, unsigned int idx) +{ + return page_to_pfn(vmq->pages[idx]); +} + +static inline unsigned long vmq_idx_to_kaddr(nc2_vmq_t *vmq, unsigned int idx) +{ + return (unsigned long)pfn_to_kaddr(vmq_idx_to_pfn(vmq, idx)); +} + +/* get vmq idx from page struct */ +static long nc2_vmq_page_index(struct page *page) +{ + nc2_vmq_buf_t *vmq_buf; + vmq_buf = (nc2_vmq_buf_t *)page->mapping; + return (vmq_buf - vmq_buf->nc->vmq.buffer); +} + +/* Read a physical device name from xenstore and + * returns a pointer to the associated net_device structure. + * Returns NULL on error. */ +static struct net_device *read_pdev(struct xenbus_device *dev) +{ + char *pdevstr; + struct net_device *pdev = NULL; + + pdevstr = xenbus_read(XBT_NIL, dev->nodename, "pdev", NULL); + if (IS_ERR(pdevstr)) + return NULL; + + if (pdevstr) { + pdev = dev_get_by_name(pdevstr); + } + + kfree(pdevstr); + + return pdev; +} + +static void nc2_vmq_page_release(struct page *page) +{ + printk("%s: ERROR: Unexpected release of netchannel2 vmq page", + __FUNCTION__); + BUG_ON(1); +} + +static inline int nc2_vmq_is_disabled(struct netchannel2 *nc) +{ + return (nc->vmq.vmq_state == VMQ_QUEUE_DISABLED); +} + +static inline int nc2_vmq_is_starting(struct netchannel2 *nc) +{ + return (nc->vmq.vmq_state == VMQ_QUEUE_STARTING); +} + +static inline int nc2_vmq_is_enabled(struct netchannel2 *nc) +{ + return (nc->vmq.vmq_state == VMQ_QUEUE_ENABLED); +} + +static inline int nc2_vmq_is_closing(struct netchannel2 *nc) +{ + return (nc->vmq.vmq_state == VMQ_QUEUE_CLOSING); +} + +static inline void nc2_vmq_enable(struct netchannel2 *nc) +{ + nc2_vmq_t *vmq = &nc->vmq; + vmq_get(vmq); + 
vmq_enable_queue(vmq->pdev, vmq->vmq_id); + vmq->vmq_state = VMQ_QUEUE_ENABLED; +} + +void nc2_vmq_disconnect(struct netchannel2 *nc) +{ + nc2_vmq_t *vmq = &nc->vmq; + + if ( nc2_vmq_is_enabled(nc) ) { + vmq_disable_queue(vmq->pdev, vmq->vmq_id); + vmq_free_queue(vmq->pdev, vmq->vmq_id); + vmq->vmq_state = VMQ_QUEUE_CLOSING; + /* wait until all buffers have been returned by dev driver */ + wait_event(vmq->waiting_to_free, + atomic_read(&vmq->refcnt) == 0); + return; + } + + if ( nc2_vmq_is_starting(nc) ) { + vmq_free_queue(vmq->pdev, vmq->vmq_id); + vmq->vmq_state = VMQ_QUEUE_CLOSING; + return; + } + +} + + +static void nc2_vmq_end_map_buffers(gnttab_map_grant_ref_t *mop, int count, + struct netchannel2 *nc, u16 *alloc_idx) +{ + int i, err; + u16 idx; + unsigned int prod; + nc2_vmq_t *vmq = &nc->vmq; + + prod = vmq->mapped_pages_prod; + + for (i = 0; i < count; i++) { + + idx = alloc_idx[i]; + + /* Check error status */ + err = mop->status; + if (likely(!err)) { + set_phys_to_machine( + __pa(vmq_idx_to_kaddr(vmq, idx)) + >> PAGE_SHIFT, + FOREIGN_FRAME(mop->dev_bus_addr + >> PAGE_SHIFT)); + + /* Store the handle */ + vmq->buffer[idx].buf->grant_handle = mop->handle; + + /* Add it to the mapped pages list */ + vmq->mapped_pages[VMQ_IDX_MASK(prod++)] = idx; + mop++; + continue; + } + + /* Error mapping page: return posted buffer to other end. + * TODO: We might need an error field on the return buffer + * message */ + return_tx_buffer(nc, vmq->buffer[idx].buf); + + /* Add the page back to the free list */ + vmq->unmapped_pages[VMQ_IDX_MASK(vmq->unmapped_pages_prod++)] + = idx; + + mop++; + } + + smp_wmb(); + vmq->mapped_pages_prod = prod; + + return; +} + +/* Map guest buffers and place them in the mapped buffers list. The mapped + * pages in this list are used when allocating a skb (vmq_alloc_skb()). 
+ */ +static void nc2_vmq_map_buffers(struct netchannel2 *nc) +{ + u16 idx; + int count = 0; + unsigned int cons; + int nbufs; + int buf_avail; + struct nc2_tx_buffer *buf; + struct nc2_vmq *vmq = &nc->vmq; + int n_mapped = nr_vmq_bufs(nc); + + + /* + * Putting hundreds of bytes on the stack is considered rude. + * Static works because a tasklet can only be on one CPU at any time. + */ + static gnttab_map_grant_ref_t rx_map_ops[VMQ_MAX_BUFFERS]; + static u16 alloc_idx[VMQ_MAX_BUFFERS]; + + /* If there is at least VMQ_MIN_BUFFERS buffers, no work to do */ + if( n_mapped >= VMQ_MIN_BUFFERS) + return; + + /* Try to get VMQ_MAX_BUFFERS mapped buffers, if there are + sufficient buffers posted by the other end */ + nbufs = VMQ_MAX_BUFFERS - n_mapped; + buf_avail = nc->nr_avail_tx_buffers; + if (nbufs > buf_avail) + nbufs = buf_avail; + + /* Xen cannot handle more than 512 grant ops in a single hypercall */ + if (nbufs > 512) + nbufs = 512; + + /* give up if there are no buffers available */ + if (nbufs <= 0) + return; + + /* Note that we *should* have free pages to consume here + * and no checks are needed. 
+ */ + cons = vmq->unmapped_pages_cons; + + while (count < nbufs) { + idx = vmq->unmapped_pages[VMQ_IDX_MASK(cons++)]; + buf = vmq->buffer[idx].buf = _get_tx_buffer(nc); + /* Setup grant map operation */ + gnttab_set_map_op(&rx_map_ops[count], + vmq_idx_to_kaddr(vmq, idx), + GNTMAP_host_map, + buf->gref, + nc->rings.otherend_id); + alloc_idx[count] = idx; + count++; + } + + vmq->unmapped_pages_cons = cons; + + /* Map all the pages */ + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + rx_map_ops, nbufs)); + + /* Finalize buffer mapping after checking if the grant operations + succeeded */ + nc2_vmq_end_map_buffers(rx_map_ops, nbufs, nc, alloc_idx); + + vmq->nbufs += nbufs; +} + +static void nc2_vmq_unmap_buf(struct netchannel2 *nc, + unsigned int idx, int recycle) +{ + nc2_vmq_t *vmq = &nc->vmq; + unsigned long pfn; + gnttab_unmap_grant_ref_t gop; + unsigned prod; + int ret; + + pfn = vmq_idx_to_pfn(vmq, idx); + /* Already unmapped? */ + if (!phys_to_machine_mapping_valid(pfn)) + return; + gnttab_set_unmap_op(&gop, vmq_idx_to_kaddr(vmq, idx), + GNTMAP_host_map, + vmq->buffer[idx].buf->grant_handle); + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &gop, 1); + BUG_ON(ret); + + vmq->nbufs--; + + set_phys_to_machine(__pa(vmq_idx_to_kaddr(vmq, idx)) >> + PAGE_SHIFT, + INVALID_P2M_ENTRY); + /* Ready for next use. 
*/ + gnttab_reset_grant_page(vmq->pages[idx]); + /* Add the page back to the unmapped list */ + prod = vmq->unmapped_pages_prod; + vmq->unmapped_pages[VMQ_IDX_MASK(prod++)] = idx; + if (recycle) + recycle_tx_buffer(nc, vmq->buffer[idx].buf); + else + free_tx_buffer(nc, vmq->buffer[idx].buf); + smp_wmb(); + vmq->unmapped_pages_prod = prod; +} + +static void nc2_vmq_free_mapped_bufs(struct netchannel2 *nc) +{ + nc2_vmq_t *vmq = &nc->vmq; + unsigned int idx; + unsigned prod, cons; + + /* The queue should be disabled before this function is called */ + BUG_ON(vmq->vmq_state == VMQ_QUEUE_ENABLED); + + cons = vmq->mapped_pages_cons; + prod = vmq->mapped_pages_prod; + smp_rmb(); + + while(cons != prod) { + idx = vmq->mapped_pages[VMQ_IDX_MASK(cons++)]; + nc2_vmq_unmap_buf(nc, idx, 1); + } + + vmq->mapped_pages_cons = cons; + +} + +static void nc2_vmq_free_skb(struct sk_buff *skb) +{ + struct netchannel2 *nc; + nc2_vmq_t *vmq; + unsigned int idx; + int nr_frags, i; + struct skb_shared_info *shinfo = skb_shinfo(skb); + skb_frag_t *frags = shinfo->frags; + + nc = netdev_priv(skb->dev); + vmq = &nc->vmq; + + nr_frags = shinfo->nr_frags; + for (i = 0; i < nr_frags; i++) { + idx = nc2_vmq_page_index(frags[i].page); + nc2_vmq_unmap_buf(nc, idx, 1); + } + + shinfo->frag_list = NULL; + shinfo->nr_frags = 0; + + /* Add the skb back to the free pool */ + skb_queue_tail(&vmq->free_skb_list, skb); +} + +/* Initialize the free socket buffer list */ +static int vmq_init_free_skb_list(int n, struct sk_buff_head *free_skb_list) { + int i; + struct sk_buff *skb; + + skb_queue_head_init(free_skb_list); + + for(i = 0; i < n; i++) { + skb = alloc_skb(VMQ_SKB_SIZE, GFP_ATOMIC); + if(!skb) { + printk("Netchannel2 vmq: Failed to allocate socket " + "buffer %d (max=%d)\n", i,(int)n); + goto error; + } + skb_queue_tail(free_skb_list, skb); + } + + return 0; +error: + /* Free all the allocated buffers and return Error */ + while(!skb_queue_empty(free_skb_list)) { + 
kfree_skb(skb_dequeue(free_skb_list)); + } + + return -1; +} + +/* Initialize vmq. Return 1 if vmq is used and 0 otherwise */ +int nc2_vmq_connect(struct netchannel2 *nc) +{ + nc2_vmq_t *vmq = &nc->vmq; + struct page *page; + int q_id; + int size; + int i; + + vmq->vmq_mode = 0; + vmq->pdev = read_pdev(nc->xenbus_device); + + /* cannot use vmq mode if physical device not found */ + if (!vmq->pdev) + return 0; + + /* Allocate a RX queue */ + if((q_id = vmq_alloc_queue(vmq->pdev, VMQ_TYPE_RX)) < 0) + /* Allocation failed, cannot use multi-queue */ + goto free_pdev; + + vmq->vmq_id = q_id; + + /* Set the size of the queue */ + size = vmq_get_maxsize(vmq->pdev); + if (size > VMQ_QUEUE_SIZE) + size = VMQ_QUEUE_SIZE; + if(vmq_set_size(vmq->pdev, q_id, size) < 0) { + /* Failure, free up the queue and fall back to non-vmq mode */ + printk("%s: could not set queue size on net device\n", + __FUNCTION__); + goto free_queue; + } + vmq->vmq_size = size; + + /* Set the mac address of the queue */ + if(vmq_set_mac(vmq->pdev, q_id, nc->rings.remote_mac) < 0) { + /* Failure, free up the queue and fall back to non-vmq mode */ + printk("%s: could not set MAC address for net device queue\n", + __FUNCTION__); + goto free_queue; + } + + vmq->pages = alloc_empty_pages_and_pagevec(VMQ_MAX_BUFFERS); + if (vmq->pages == NULL) { + printk("%s: out of memory\n", __FUNCTION__); + goto free_queue; + } + + skb_queue_head_init(&vmq->dealloc_queue); + skb_queue_head_init(&vmq->rx_queue); + + if(vmq_init_free_skb_list(VMQ_MAX_BUFFERS, + &vmq->free_skb_list)) { + printk("%s: Could not allocate free socket buffers\n", + __FUNCTION__); + goto free_pagevec; + } + + for (i = 0; i < VMQ_MAX_BUFFERS; i++) { + vmq->buffer[i].nc = nc; + page = vmq->pages[i]; + SetPageForeign(page, nc2_vmq_page_release); + page->mapping = (void *)&vmq->buffer[i]; + vmq->unmapped_pages[i] = i; + } + + vmq->unmapped_pages_prod = VMQ_MAX_BUFFERS; + vmq->unmapped_pages_cons = 0; + + vmq->mapped_pages_prod = 0; + vmq->mapped_pages_cons = 0; + + vmq->nbufs =
0; + vmq->vmq_mode = 1; + + /* Store the pointer to netchannel2 device in pdev */ + BUG_ON((vmq->pdev->vmq == NULL) || (vmq->pdev->vmq->queue == NULL)); + vmq->pdev->vmq->queue[q_id].guest = (void*) nc->net_device; + + atomic_set(&vmq->refcnt, 0); + init_waitqueue_head(&vmq->waiting_to_free); + + printk(KERN_INFO "Netchannel2 using vmq mode for guest %d\n", + nc->xenbus_device->otherend_id); + + vmq->vmq_state = VMQ_QUEUE_STARTING; + + return 1; /* Success */ + + +free_pagevec: + free_empty_pages_and_pagevec(vmq->pages, VMQ_MAX_BUFFERS); +free_queue: + vmq_free_queue(vmq->pdev, vmq->vmq_id); +free_pdev: + dev_put(vmq->pdev); + vmq->pdev = NULL; + return 0; + +} + +void nc2_vmq_shutdown(struct netchannel2 *nc) +{ + nc2_vmq_t *vmq = &nc->vmq; + int i; + + if (!vmq->vmq_mode) + return; + + /* All posted bufs should have been returned */ + BUG_ON(nr_vmq_bufs(nc) != nr_vmq_mapped_bufs(nc)); + + /* free the mapped bufs */ + nc2_vmq_free_mapped_bufs(nc); + + /* Free the vmq pages */ + if (vmq->pages) { + for (i = 0; i < VMQ_MAX_BUFFERS; i++) { + if (PageForeign(vmq->pages[i])) + ClearPageForeign(vmq->pages[i]); + vmq->pages[i]->mapping = NULL; + } + free_empty_pages_and_pagevec(vmq->pages, VMQ_MAX_BUFFERS); + vmq->pages = NULL; + } + + while(!skb_queue_empty(&vmq->free_skb_list)) { + /* Free the socket buffer pool */ + kfree_skb(skb_dequeue(&vmq->free_skb_list)); + } + vmq->vmq_state = VMQ_QUEUE_DISABLED; + vmq->vmq_mode = 0; + + if (vmq->pdev) { + dev_put(vmq->pdev); + vmq->pdev = NULL; + } + + vmq_put(vmq); +} + +void do_vmq_work(struct netchannel2 *nc) +{ + nc2_vmq_t *vmq = &nc->vmq; + + struct transmitted_packet *tp; + struct sk_buff *skb; + unsigned long flags; + + /* if not in vmq mode do nothing */ + if (!nc2_in_vmq_mode(nc)) + return; + + /* Map guest buffers for dedicated NIC RX queue if needed */ + if (nr_vmq_bufs(nc) < VMQ_MIN_BUFFERS) { + nc2_vmq_map_buffers(nc); + /* We delay enabling the queue until we have enough + posted buffers. 
Check if it is time to enable it */ + if (nc2_vmq_is_starting(nc) && + (nr_vmq_bufs(nc) >= VMQ_MIN_BUFFERS)) { + nc2_vmq_enable(nc); + } + } + + /* free vmq skb's returned by the physical device driver */ + while(!skb_queue_empty(&nc->vmq.dealloc_queue)) { + nc2_vmq_free_skb(skb_dequeue(&nc->vmq.dealloc_queue)); + } + + /* complete vmq closing after all packets returned by physical + * device driver */ + + if (nc2_vmq_is_closing(nc) && + (nr_vmq_bufs(nc) == nr_vmq_mapped_bufs(nc))) { + nc->vmq.vmq_state = VMQ_QUEUE_DISABLED; + nc2_vmq_shutdown(nc); + } + + spin_lock_irqsave(&vmq->rx_queue.lock, flags); + while(!skb_queue_empty(&vmq->rx_queue)) { + skb = __skb_dequeue(&nc->vmq.rx_queue); + tp = prepare_xmit_allocate_vmq(nc, skb); + if (tp == NULL) { + __skb_queue_head(&vmq->rx_queue, skb); + spin_unlock_irqrestore(&vmq->rx_queue.lock,flags); + return; + } + queue_packet_to_interface(tp, &nc->rings); + } + spin_unlock_irqrestore(&vmq->rx_queue.lock,flags); +} + +/* Return the netchannel2 device stored in the 'guest' field of the given queue in pdev */ +static inline struct net_device *nc2_vmq_queue_to_vif(struct net_device *pdev, + int queue_id) +{ + net_vmq_t *n_vmq; + vmq_queue_t *vmq_q; + + n_vmq = pdev->vmq; + BUG_ON(n_vmq == NULL); + vmq_q = &n_vmq->queue[queue_id]; + BUG_ON(vmq_q == NULL); + + return (struct net_device*)vmq_q->guest; +} + +/* Handle incoming vmq packet: queue it on rx_queue and kick the ring. Always returns 0. */ +int vmq_netif_rx(struct sk_buff *skb, int queue_id) +{ + struct net_device *dev; + struct netchannel2 *nc; + nc2_vmq_t *vmq; + + /* get the netchannel2 interface corresponding to this queue */ + dev = nc2_vmq_queue_to_vif(skb->dev, queue_id); + nc = netdev_priv(dev); + vmq = &nc->vmq; + + /* replace source dev with destination dev */ + skb->dev = dev; + /* add skb to rx_queue */ + skb_queue_tail(&vmq->rx_queue, skb); + + /* Trigger thread execution to process new packets */ + nc2_kick(&nc->rings); + + return 0; +} + + +/* Allocate a socket buffer from the free list, get a guest posted + * buffer, attach it
to the skb, and return it. + */ +struct sk_buff *vmq_alloc_skb(struct net_device *netdevice, int queue_id, + unsigned int length) +{ + struct sk_buff *skb; + struct netchannel2 *nc; + nc2_vmq_t *vmq; + unsigned int idx; + int nr_bufs, i; + unsigned int cons; + unsigned int prod; + + /* get the netchannel2 interface corresponding to this queue */ + nc = netdev_priv(nc2_vmq_queue_to_vif(netdevice, queue_id)); + + vmq = &nc->vmq; + + /* Get a free buffer from the pool */ + if(skb_queue_empty(&vmq->free_skb_list)) { + /* No buffers to allocate */ + return NULL; + } + + + skb = skb_dequeue(&vmq->free_skb_list); + BUG_ON(skb == NULL); + + nr_bufs = VMQ_NUM_BUFFERS(length); + + cons = vmq->mapped_pages_cons; + prod = vmq->mapped_pages_prod; + smp_rmb(); + + if(nr_bufs > (prod - cons)) + /* Not enough mapped buffers in the pool */ + goto kick_nc2; + + if(nr_bufs > MAX_SKB_FRAGS) + goto error; + + for(i = 0; i < nr_bufs; i++) { + idx = vmq->mapped_pages[VMQ_IDX_MASK(cons)]; + /* FIX ME: This can be simplified */ + skb_shinfo(skb)->frags[i].page = + virt_to_page(vmq_idx_to_kaddr(vmq,idx)); + skb_shinfo(skb)->frags[i].page_offset = 0; + skb_shinfo(skb)->frags[i].size = PAGE_SIZE; + skb_shinfo(skb)->nr_frags++; + cons++; + } + + vmq->mapped_pages_cons = cons; + + /* if number of buffers get low run tasklet to map more buffers */ + if (nr_vmq_bufs(nc) < VMQ_MIN_BUFFERS) + nc2_kick(&nc->rings); + + return skb; + +kick_nc2: + /* kick netchannel2 interface to get any recently posted buffers */ + nc2_kick(&nc->rings); +error: + /* Add the skb back to the free pool */ + skb_queue_tail(&vmq->free_skb_list, skb); + return NULL; +} + +/* Detach the guest pages and free the socket buffer */ +void vmq_free_skb(struct sk_buff *skb, int queue_id) +{ + struct net_device *dev; + struct netchannel2 *nc; + nc2_vmq_t *vmq; + + /* get the netchannel2 interface corresponding to this queue */ + dev = nc2_vmq_queue_to_vif(skb->dev, queue_id); + + nc = netdev_priv(dev); + vmq = &nc->vmq; + + /* Add 
skb to the dealloc queue */ + skb->dev = dev; + skb_queue_tail(&vmq->dealloc_queue, skb); + + /* kick netchannel2 interface */ + nc2_kick(&nc->rings); + +} + +int nc2_is_vmq_packet(struct netchannel2 *nc, struct sk_buff *skb) +{ + int nr_frags; + long idx; + nc2_vmq_t *vmq = &nc->vmq; + + nr_frags = skb_shinfo(skb)->nr_frags; + if (vmq->vmq_mode && nr_frags && + PageForeign(skb_shinfo(skb)->frags[0].page)) { + idx = nc2_vmq_page_index(skb_shinfo(skb)->frags[0].page); + if ( (idx >= 0) && (idx < VMQ_MAX_BUFFERS) ) + return 1; + } + + return 0; +} + +struct transmitted_packet *prepare_xmit_allocate_vmq(struct netchannel2 *nc, + struct sk_buff *skb) +{ + struct transmitted_packet *tp; + int nr_fragments; + unsigned msg_size; + + spin_assert_locked(&nc->rings.lock); + + nr_fragments = skb_shinfo(skb)->nr_frags; + tp = allocate_tx_packet(&nc->rings, nr_fragments); + + if (tp == NULL) { + + DEBUGMSG("Failed to prepare/allocate packet.\n"); + RETURN(NULL); + } + + tp->policy = transmit_policy_vmq; + + sanity_check_transmitted_packet(tp); + + msg_size = get_transmitted_packet_msg_size(tp); + + if (nc2_reserve_payload_bytes(&nc->rings.prod_ring, msg_size)) { + DEBUGMSG("tp %p", tp); + tp->skb = skb; + RETURN(tp); + } + + nc->tx.nr_failed_reserve_ring++; + DEBUGMSG("Not enough space on ring (need %d)", msg_size); + release_tx_packet(&nc->rings, tp); + RETURN(NULL); +} + +/* Prepare to transmit a vmq packet */ +void prepare_xmit_vmq(struct netchannel2 *nc, struct transmitted_packet *tp) +{ + nc2_vmq_t *vmq = &nc->vmq; + struct sk_buff *skb = tp->skb; + skb_frag_t *frag; + struct transmitted_fragment *txf; + struct nc2_tx_buffer *txbuf; + int nr_frags; + unsigned int idx; + + nr_frags = skb_shinfo(skb)->nr_frags; + + tp->nr_fragments = nr_frags; + tp->inline_prefix_size = 0; + + tp->msg.type = NC2_PACKET_TYPE_pre_posted; + tp->msg.prefix_size = 0; + + frag = skb_shinfo(skb)->frags; + list_for_each_entry(txf, &tp->fragments, list) { + idx = nc2_vmq_page_index(frag->page); + 
txbuf = vmq->buffer[idx].buf; + txf->frag.pre_post.id = txbuf->id; + txf->frag.off = frag->page_offset; + txf->frag.size = frag->size; + /* TODO: need to batch unmap grants */ + nc2_vmq_unmap_buf(nc, idx, 0); + frag++; + } + + /* Avoid unmapping frags grants when skb is freed later */ + /* by nc2_vmq_free_skb() */ + skb_shinfo(skb)->nr_frags = 0; +} diff -r 9f69a5d6b95e -r 9923186b736e drivers/xen/netchannel2/vmq.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/netchannel2/vmq.h Fri Dec 19 18:46:00 2008 -0800 @@ -0,0 +1,40 @@ +#ifndef VMQ_H__ +#define VMQ_H__ + +#include "netchannel2_core.h" + +int nc2_vmq_connect(struct netchannel2 *nc); +void nc2_vmq_disconnect(struct netchannel2 *nc); +void do_vmq_work(struct netchannel2 *nc); +int nc2_is_vmq_packet(struct netchannel2 *nc, struct sk_buff *skb); +struct transmitted_packet *prepare_xmit_allocate_vmq(struct netchannel2 *nc, + struct sk_buff *skb); +void prepare_xmit_vmq(struct netchannel2 *nc, struct transmitted_packet *tp); + +#define vmq_get(_b) \ + atomic_inc(&(_b)->refcnt) + +#define vmq_put(_b) \ + do { \ + if ( atomic_dec_and_test(&(_b)->refcnt) ) { \ + wake_up(&(_b)->waiting_to_free); \ + } \ + } while (0) + +static inline int nr_vmq_mapped_bufs(struct netchannel2 *nc) +{ + return nc->vmq.mapped_pages_prod - + nc->vmq.mapped_pages_cons; +} + +static inline int nr_vmq_bufs(struct netchannel2 *nc) +{ + return nc->vmq.nbufs; +} + +static inline int nc2_in_vmq_mode(struct netchannel2 *nc) +{ + return nc->vmq.vmq_mode; +} + +#endif /* !VMQ_H__ */ diff -r 9f69a5d6b95e -r 9923186b736e drivers/xen/netchannel2/vmq_def.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/netchannel2/vmq_def.h Fri Dec 19 18:46:00 2008 -0800 @@ -0,0 +1,72 @@ +#ifndef VMQ_DEF_H__ +#define VMQ_DEF_H__ + + +/* No matter what the other end wants, we never post more than this + number of RX buffers to it.
*/ +#define MAX_POSTED_BUFFERS (2048+256) + +/* size of HW queue in VMQ device */ +#define VMQ_QUEUE_SIZE 1024 + +/* Minimum amount of buffers needed for VMQ + * This is the lower water mark that triggers mapping more guest buffers + * Should be larger than the queue size to allow for in flight packets + */ +#define VMQ_MIN_BUFFERS 1920 + +/* Maximum amount of posted buffers which are reserved for VMQ + * Should be less than MAX_POSTED_BUFFERS. For now, the difference can be used + * for intra-node guest to guest traffic. When we map guest buffers we try to + * have VMQ_MAX_BUFFERS mapped. The difference (VMQ_MAX_BUFFERS-VMQ_MIN_BUFFERS) + * helps batch multiple grant map operations + * VMQ_QUEUE_SIZE < VMQ_MIN_BUFFERS < VMQ_MAX_BUFFERS < MAX_POSTED_BUFFERS + * VMQ_MAX_BUFFERS must be a power of 2 (VMQ_IDX_MASK relies on it) + */ +#define VMQ_MAX_BUFFERS 2048 + +/* skb size is zero since packet data uses fragments */ +#define VMQ_SKB_SIZE 0 + +#define VMQ_NUM_BUFFERS(len) (((len) + PAGE_SIZE - 1) / PAGE_SIZE) + +#define VMQ_IDX_MASK(_i) ((_i)&(VMQ_MAX_BUFFERS-1)) + +typedef struct nc2_vmq_buf { + struct nc2_tx_buffer *buf; + struct netchannel2 *nc; +} nc2_vmq_buf_t; + +typedef struct nc2_vmq { + struct net_device *pdev; /* Pointer to physical device */ + int vmq_mode; /* indicate if vif is in vmq mode */ + struct page **pages; /* pages for mapping guest RX bufs */ + struct sk_buff_head free_skb_list; /* Free socket buffer pool */ + struct sk_buff_head dealloc_queue; /* list of skb's to be freed */ + struct sk_buff_head rx_queue; /* list of received packets */ + + /* guest mapped buffers */ + nc2_vmq_buf_t buffer[VMQ_MAX_BUFFERS]; + + /* Ring with free pages available for mapping guest RX buffers */ + u16 unmapped_pages[VMQ_MAX_BUFFERS]; + unsigned int unmapped_pages_prod; + unsigned int unmapped_pages_cons; + + /* Ring of mapped RX pages available for vmq device */ + u16 mapped_pages[VMQ_MAX_BUFFERS]; + unsigned int mapped_pages_prod; + unsigned int mapped_pages_cons; + + unsigned int nbufs; /*
number of vmq buffers: posted to */ + /* HW queue or available to be posted */ + int vmq_id; /* Queue id */ + int vmq_size; /* Queue size */ + int vmq_state; /* queue state */ + + atomic_t refcnt; + wait_queue_head_t waiting_to_free; + +} nc2_vmq_t; + +#endif /* !VMQ_DEF_H__ */