This only includes the transmit half, because the receiver uses an
unmodified posted buffers mode implementation.
This includes various bits of patches which were contributed by the
people listed in the Signed-off-by lines below.
Signed-off-by: Jose Renato Santos <jsantos@xxxxxxxxxx>
Signed-off-by: Mitch Williams <mitch.a.williams@xxxxxxxxx>
Signed-off-by: Steven Smith <steven.smith@xxxxxxxxxx>
All bugs are mine, of course.
---
drivers/xen/Kconfig | 5 +
drivers/xen/netchannel2/Makefile | 4 +
drivers/xen/netchannel2/chan.c | 7 +-
drivers/xen/netchannel2/netback2.c | 9 +
drivers/xen/netchannel2/netchannel2_core.h | 10 +
drivers/xen/netchannel2/posted_buffer.h | 50 ++
drivers/xen/netchannel2/posted_buffers.c | 20 +-
drivers/xen/netchannel2/util.c | 8 +-
drivers/xen/netchannel2/vmq.c | 805 ++++++++++++++++++++++++++++
drivers/xen/netchannel2/vmq.h | 58 ++
drivers/xen/netchannel2/vmq_def.h | 68 +++
drivers/xen/netchannel2/xmit_packet.c | 6 +
12 files changed, 1029 insertions(+), 21 deletions(-)
create mode 100644 drivers/xen/netchannel2/posted_buffer.h
create mode 100644 drivers/xen/netchannel2/vmq.c
create mode 100644 drivers/xen/netchannel2/vmq.h
create mode 100644 drivers/xen/netchannel2/vmq_def.h
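
An illustrative sketch (not part of the patch): how a vmq-capable physical
NIC driver might drive the transmit-side interface exported by vmq.c below.
The driver hooks (example_refill_queue, example_rx_complete) and the exact
descriptor programming are hypothetical; vmq_alloc_skb(), vmq_netif_rx() and
vmq_free_skb() are the symbols this patch exports, presumably declared in
linux/netvmq.h as part of the separate NET_VMQ infrastructure the Kconfig
option depends on.

    #include <linux/netdevice.h>
    #include <linux/netvmq.h>

    /* Hypothetical refill path: pull an skb whose fragments are
     * already-mapped guest RX pages and post those pages to the HW
     * queue dedicated to this guest. */
    static int example_refill_queue(struct net_device *pdev, int qid)
    {
            struct sk_buff *skb = vmq_alloc_skb(pdev, qid, PAGE_SIZE);
            if (!skb)
                    return -ENOMEM; /* guest has not posted enough buffers yet */
            /* ... program skb_shinfo(skb)->frags[] into the HW RX descriptors ... */
            return 0;
    }

    /* Hypothetical RX-completion path: the HW has DMAed directly into
     * the guest pages, so hand the skb to netchannel2, which forwards
     * the pre-posted buffers to the guest without a copy. */
    static void example_rx_complete(struct sk_buff *skb, int qid,
                                    unsigned int len)
    {
            skb->len = len;
            skb->data_len = len;       /* all data lives in the frags */
            vmq_netif_rx(skb, qid);    /* queued on the vif's rx_queue */
    }

    /* If the HW hands a buffer back unused, return it with
     * vmq_free_skb(skb, qid) so the guest pages can be recycled. */
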
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index a7e5b5c..a37b0cd 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -234,6 +234,11 @@ config XEN_NETDEV2_FRONTEND
depends on XEN_NETCHANNEL2
default y
+config XEN_NETDEV2_VMQ
+ bool "Net channel 2 support for multi-queue devices"
+ depends on XEN_NETDEV2_BACKEND && NET_VMQ
+ default y
+
config XEN_NETDEV2_BYPASSABLE
bool "Net channel 2 bypassee support"
depends on XEN_NETDEV2_BACKEND
diff --git a/drivers/xen/netchannel2/Makefile b/drivers/xen/netchannel2/Makefile
index 11a257e..918d8d8 100644
--- a/drivers/xen/netchannel2/Makefile
+++ b/drivers/xen/netchannel2/Makefile
@@ -12,6 +12,10 @@ ifeq ($(CONFIG_XEN_NETDEV2_FRONTEND),y)
netchannel2-objs += netfront2.o
endif
+ifeq ($(CONFIG_XEN_NETDEV2_VMQ),y)
+netchannel2-objs += vmq.o
+endif
+
ifeq ($(CONFIG_XEN_NETDEV2_BYPASSABLE),y)
netchannel2-objs += bypassee.o
endif
diff --git a/drivers/xen/netchannel2/chan.c b/drivers/xen/netchannel2/chan.c
index 060b49b..8dad6fe 100644
--- a/drivers/xen/netchannel2/chan.c
+++ b/drivers/xen/netchannel2/chan.c
@@ -13,6 +13,7 @@
#include "netchannel2_endpoint.h"
#include "netchannel2_core.h"
+#include "vmq.h"
static int process_ring(struct napi_struct *napi,
int work_avail);
@@ -810,6 +811,8 @@ static int process_ring(struct napi_struct *napi,
/* Pick up incoming messages. */
work_done = nc2_poll(ncrp, work_avail, &rx_queue);
+ do_vmq_work(nc);
+
/* Transmit pending packets. */
if (!skb_queue_empty(&ncrp->pending_tx_queue)) {
skb = __skb_dequeue(&ncrp->pending_tx_queue);
@@ -828,9 +831,11 @@ static int process_ring(struct napi_struct *napi,
This must happen before we flush the rings, since
that's when the PACKET messages will be made
visible to the other end. */
- if (ncrp == &nc->rings)
+ if (ncrp == &nc->rings) {
flush_hypercall_batcher(&nc->batcher,
nc2_posted_on_gntcopy_fail);
+ vmq_flush_unmap_hypercall();
+ }
flush_rings(ncrp);
diff --git a/drivers/xen/netchannel2/netback2.c b/drivers/xen/netchannel2/netback2.c
index 129ef81..eb2a781 100644
--- a/drivers/xen/netchannel2/netback2.c
+++ b/drivers/xen/netchannel2/netback2.c
@@ -10,8 +10,13 @@
#include "netchannel2_core.h"
#include "netchannel2_endpoint.h"
#include "netchannel2_uspace.h"
+#include "vmq.h"
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+#define NR_TX_BUFS (VMQ_MAX_BUFFERS+256)
+#else
#define NR_TX_BUFS 256
+#endif
static atomic_t next_handle;
/* A list of all currently-live netback2 interfaces. */
@@ -168,6 +173,8 @@ static int attach_to_frontend(struct netback2 *nd)
return err;
}
+ nc2_vmq_connect(nc);
+
/* All done */
nd->attached = 1;
@@ -176,6 +183,8 @@ static int attach_to_frontend(struct netback2 *nd)
static void nb2_shutdown(struct netchannel2 *nc)
{
+ nc2_vmq_disconnect(nc);
+
nc2_set_nr_tx_buffers(nc, 0);
}
diff --git a/drivers/xen/netchannel2/netchannel2_core.h b/drivers/xen/netchannel2/netchannel2_core.h
index 1939cbb..8e1657d 100644
--- a/drivers/xen/netchannel2/netchannel2_core.h
+++ b/drivers/xen/netchannel2/netchannel2_core.h
@@ -7,6 +7,8 @@
#include <linux/skbuff.h>
#include <linux/netdevice.h>
+#include "vmq_def.h"
+
/* After we send this number of frags, we request the other end to
* notify us when sending the corresponding finish packet message */
#define MAX_MAX_COUNT_FRAGS_NO_EVENT 192
@@ -43,6 +45,9 @@ enum transmit_policy {
transmit_policy_grant = transmit_policy_first,
transmit_policy_post,
transmit_policy_map,
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+ transmit_policy_vmq,
+#endif
transmit_policy_small,
transmit_policy_last = transmit_policy_small
};
@@ -437,6 +442,11 @@ struct netchannel2 {
struct hypercall_batcher batcher;
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+ /* vmq data for supporting multi-queue devices */
+ nc2_vmq_t vmq;
+#endif
+
#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
struct nc2_auto_bypass auto_bypass;
#endif
diff --git a/drivers/xen/netchannel2/posted_buffer.h b/drivers/xen/netchannel2/posted_buffer.h
new file mode 100644
index 0000000..e249777
--- /dev/null
+++ b/drivers/xen/netchannel2/posted_buffer.h
@@ -0,0 +1,50 @@
+/* Buffer management related bits, shared between vmq.c and
+ * posted_buffers.c */
+#ifndef NC2_POSTED_BUFFER_H__
+#define NC2_POSTED_BUFFER_H__
+
+/* A buffer which the other end has provided us which we can use to
+ transmit packets to it. */
+struct nc2_tx_buffer {
+ struct list_head list;
+ uint32_t id; /* ID assigned by the remote endpoint. */
+ grant_ref_t gref;
+ uint16_t off_in_page;
+ uint16_t size;
+ grant_handle_t grant_handle;
+};
+
+/* Add a buffer to the pending list of buffers to be returned to the other end */
+static inline void return_tx_buffer(struct netchannel2 *nc,
+ struct nc2_tx_buffer *buffer)
+{
+ list_add(&buffer->list, &nc->pending_tx_buffer_return);
+}
+
+static inline struct nc2_tx_buffer *_get_tx_buffer(struct netchannel2 *nc)
+{
+ struct nc2_tx_buffer *buffer;
+ struct list_head *entry = nc->avail_tx_buffers.next;
+ list_del(entry);
+ buffer = list_entry(entry, struct nc2_tx_buffer, list);
+ nc->nr_avail_tx_buffers--;
+ return buffer;
+}
+
+/* recycle a posted buffer: return it to the list of available buffers */
+static inline void recycle_tx_buffer(struct netchannel2 *nc,
+ struct nc2_tx_buffer *buffer)
+{
+ list_add(&buffer->list, &nc->avail_tx_buffers);
+ nc->nr_avail_tx_buffers++;
+}
+
+/* add a buffer slot to list of unused buffer slots after it has been
+ * returned to other end */
+static inline void free_tx_buffer(struct netchannel2 *nc,
+ struct nc2_tx_buffer *buffer)
+{
+ list_add(&buffer->list, &nc->unused_tx_buffer_slots);
+}
+
+#endif /* !NC2_POSTED_BUFFER_H__ */
diff --git a/drivers/xen/netchannel2/posted_buffers.c b/drivers/xen/netchannel2/posted_buffers.c
index 96de7da..9fb7570 100644
--- a/drivers/xen/netchannel2/posted_buffers.c
+++ b/drivers/xen/netchannel2/posted_buffers.c
@@ -9,6 +9,7 @@
#include <xen/live_maps.h>
#include "netchannel2_endpoint.h"
#include "netchannel2_core.h"
+#include "posted_buffer.h"
#define POSTED_BUFFER_SIZE PAGE_SIZE
@@ -350,17 +351,6 @@ void nc2_handle_set_nr_posted_buffers(struct netchannel2 *nc,
/* -------------------------- Transmit ------------------------------- */
-/* A buffer which the other end has provided us which we can use to
- transmit packets to it. */
-struct nc2_tx_buffer {
- struct list_head list;
- uint32_t id; /* ID assigned by the remote endpoint. */
- grant_ref_t gref;
- uint16_t off_in_page;
- uint16_t size;
- grant_handle_t grant_handle;
-};
-
/* A representation of a packet which is halfway through being
prepared for transmission. */
struct post_packet_plan {
@@ -373,14 +363,6 @@ struct post_packet_plan {
volatile struct netchannel2_fragment *output_frag;
};
-/* add a buffer slot to list of unused buffer slots after it has been
- * returned to other end */
-static void free_tx_buffer(struct netchannel2 *nc,
- struct nc2_tx_buffer *buffer)
-{
- list_add(&buffer->list, &nc->unused_tx_buffer_slots);
-}
-
/* A grant copy failed while we were transmitting a packet. That
indicates that the *receiving* domain gave us a bad RX buffer.
We're too late to send them an error, so there isn't really
diff --git a/drivers/xen/netchannel2/util.c b/drivers/xen/netchannel2/util.c
index 79d9f09..1d96256 100644
--- a/drivers/xen/netchannel2/util.c
+++ b/drivers/xen/netchannel2/util.c
@@ -34,7 +34,13 @@ int allocate_txp_slot(struct netchannel2_ring_pair *ncrp,
static void nc2_free_skb(struct netchannel2 *nc,
struct sk_buff *skb)
{
- dev_kfree_skb(skb);
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+ nc2_vmq_t *vmq = &nc->vmq;
+ if (get_skb_overlay(skb)->policy == transmit_policy_vmq)
+ skb_queue_tail(&vmq->dealloc_queue, skb);
+ else
+#endif
+ dev_kfree_skb(skb);
}
void release_txp_slot(struct netchannel2_ring_pair *ncrp,
diff --git a/drivers/xen/netchannel2/vmq.c b/drivers/xen/netchannel2/vmq.c
new file mode 100644
index 0000000..e36962b
--- /dev/null
+++ b/drivers/xen/netchannel2/vmq.c
@@ -0,0 +1,805 @@
+/*****************************************************************************
+ * vmq.c
+ *
+ * Support multi-queue network devices.
+ *
+ * Copyright (c) 2008, Kaushik Kumar Ram, Rice University.
+ * Copyright (c) 2008, Jose Renato Santos, Hewlett-Packard Co.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+/* This only implements the transmit half of the method; receive is
+ * handled by posted_buffers.c */
+#include <linux/kernel.h>
+#include <linux/netvmq.h>
+#include <linux/skbuff.h>
+#include <xen/xenbus.h>
+#include <xen/balloon.h>
+#include "netchannel2_core.h"
+
+#include "posted_buffer.h"
+#include "vmq.h"
+
+/* state of device queue when operating in vmq mode */
+#define VMQ_QUEUE_DISABLED 0
+#define VMQ_QUEUE_STARTING 1
+#define VMQ_QUEUE_ENABLED 2
+#define VMQ_QUEUE_CLOSING 3
+
+#define VMQ_MAX_UNMAP_OPS 256
+struct vmq_unmap_grants {
+ unsigned n;
+ gnttab_unmap_grant_ref_t gop[VMQ_MAX_UNMAP_OPS];
+};
+typedef struct vmq_unmap_grants vmq_unmap_grants_t;
+
+vmq_unmap_grants_t vmq_unmap_grants;
+
+static inline void vmq_flush_unmap_grants(void)
+{
+ if (vmq_unmap_grants.n == 0)
+ return;
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ vmq_unmap_grants.gop,
+ vmq_unmap_grants.n))
+ BUG();
+ vmq_unmap_grants.n = 0;
+}
+
+static inline gnttab_unmap_grant_ref_t *vmq_next_unmap_gop(void)
+{
+ if (vmq_unmap_grants.n == VMQ_MAX_UNMAP_OPS)
+ vmq_flush_unmap_grants();
+ return &vmq_unmap_grants.gop[vmq_unmap_grants.n++];
+}
+
+void vmq_flush_unmap_hypercall(void)
+{
+ vmq_flush_unmap_grants();
+}
+
+static inline unsigned long vmq_idx_to_pfn(nc2_vmq_t *vmq, unsigned int idx)
+{
+ return page_to_pfn(vmq->pages[idx]);
+}
+
+static inline unsigned long vmq_idx_to_kaddr(nc2_vmq_t *vmq, unsigned int idx)
+{
+ return (unsigned long)pfn_to_kaddr(vmq_idx_to_pfn(vmq, idx));
+}
+
+/* get vmq idx from page struct */
+static long nc2_vmq_page_index(struct page *page)
+{
+ nc2_vmq_buf_t *vmq_buf;
+ vmq_buf = (nc2_vmq_buf_t *)page->mapping;
+ return vmq_buf - vmq_buf->nc->vmq.buffer;
+}
+
+/* Read the physical device name from xenstore and return a pointer to
+ * the associated net_device structure.
+ * Returns NULL on error. */
+static struct net_device *read_pdev(struct xenbus_device *dev)
+{
+ char *pdevstr;
+ struct net_device *pdev = NULL;
+
+ pdevstr = xenbus_read(XBT_NIL, dev->nodename, "pdev", NULL);
+ if (IS_ERR(pdevstr))
+ return NULL;
+
+ if (pdevstr)
+ pdev = dev_get_by_name(&init_net, pdevstr);
+
+ kfree(pdevstr);
+
+ return pdev;
+}
+
+static void nc2_vmq_page_release(struct page *page, unsigned int order)
+{
+ printk(KERN_CRIT "%s: ERROR: Unexpected release of netchannel2 vmq
page",
+ __func__);
+ BUG_ON(1);
+}
+
+static inline int nc2_vmq_is_disabled(struct netchannel2 *nc)
+{
+ return nc->vmq.vmq_state == VMQ_QUEUE_DISABLED;
+}
+
+static inline int nc2_vmq_is_starting(struct netchannel2 *nc)
+{
+ return nc->vmq.vmq_state == VMQ_QUEUE_STARTING;
+}
+
+static inline int nc2_vmq_is_enabled(struct netchannel2 *nc)
+{
+ return nc->vmq.vmq_state == VMQ_QUEUE_ENABLED;
+}
+
+static inline int nc2_vmq_is_closing(struct netchannel2 *nc)
+{
+ return nc->vmq.vmq_state == VMQ_QUEUE_CLOSING;
+}
+
+static inline void nc2_vmq_enable(struct netchannel2 *nc)
+{
+ nc2_vmq_t *vmq = &nc->vmq;
+ vmq_get(vmq);
+ vmq_enable_queue(vmq->pdev, vmq->vmq_id);
+ vmq->vmq_state = VMQ_QUEUE_ENABLED;
+}
+
+void nc2_vmq_disconnect(struct netchannel2 *nc)
+{
+ nc2_vmq_t *vmq = &nc->vmq;
+
+ if (nc2_vmq_is_enabled(nc)) {
+ vmq_disable_queue(vmq->pdev, vmq->vmq_id);
+ vmq_free_queue(vmq->pdev, vmq->vmq_id);
+ vmq->vmq_state = VMQ_QUEUE_CLOSING;
+ /* wait until all buffers have been returned by dev driver */
+ wait_event(vmq->waiting_to_free,
+ atomic_read(&vmq->refcnt) == 0);
+ return;
+ }
+
+ if (nc2_vmq_is_starting(nc)) {
+ vmq_free_queue(vmq->pdev, vmq->vmq_id);
+ vmq->vmq_state = VMQ_QUEUE_CLOSING;
+ return;
+ }
+
+}
+
+
+static void nc2_vmq_end_map_buffers(gnttab_map_grant_ref_t *mop, int count,
+ struct netchannel2 *nc, u16 *alloc_idx)
+{
+ int i, err;
+ u16 idx;
+ unsigned int prod;
+ nc2_vmq_t *vmq = &nc->vmq;
+
+ prod = vmq->mapped_pages_prod;
+
+ for (i = 0; i < count; i++) {
+ idx = alloc_idx[i];
+
+ /* Check error status */
+ err = mop->status;
+ if (likely(!err)) {
+ set_phys_to_machine(
+ __pa(vmq_idx_to_kaddr(vmq, idx))
+ >> PAGE_SHIFT,
+ FOREIGN_FRAME(mop->dev_bus_addr
+ >> PAGE_SHIFT));
+ /* Store the handle */
+ vmq->buffer[idx].buf->grant_handle = mop->handle;
+
+ /* Add it to the mapped pages list */
+ vmq->mapped_pages[VMQ_IDX_MASK(prod++)] = idx;
+ mop++;
+ continue;
+ }
+
+ /* Error mapping page: return posted buffer to other end.
+ * TODO: We might need an error field on the return buffer
+ * message */
+ return_tx_buffer(nc, vmq->buffer[idx].buf);
+
+ /* Add the page back to the free list */
+ vmq->unmapped_pages[VMQ_IDX_MASK(vmq->unmapped_pages_prod++)]
+ = idx;
+
+ mop++;
+ }
+
+ smp_wmb();
+ vmq->mapped_pages_prod = prod;
+
+ return;
+}
+
+/* Map guest buffers and place them in the mapped buffers list. The mapped
+ * pages in this list are used when allocating a skb (vmq_alloc_skb()).
+ */
+static void nc2_vmq_map_buffers(struct netchannel2 *nc)
+{
+ u16 idx;
+ int count = 0;
+ unsigned int cons;
+ int nbufs;
+ int buf_avail;
+ struct nc2_tx_buffer *buf;
+ struct nc2_vmq *vmq = &nc->vmq;
+ int n_mapped = nr_vmq_bufs(nc);
+
+
+ /*
+ * Putting hundreds of bytes on the stack is considered rude.
+ * Static works because a tasklet can only be on one CPU at any time.
+ */
+ static gnttab_map_grant_ref_t rx_map_ops[VMQ_MAX_BUFFERS];
+ static u16 alloc_idx[VMQ_MAX_BUFFERS];
+
+ /* If there are at least VMQ_MIN_BUFFERS buffers, there is no work to do */
+ if (n_mapped >= VMQ_MIN_BUFFERS)
+ return;
+
+ /* Try to get VMQ_MAX_BUFFERS mapped buffers, if there are
+ sufficient buffers posted by the other end */
+ nbufs = VMQ_MAX_BUFFERS - n_mapped;
+ buf_avail = nc->nr_avail_tx_buffers;
+ if (nbufs > buf_avail)
+ nbufs = buf_avail;
+
+ /* Xen cannot handle more than 512 grant ops in a single hypercall */
+ if (nbufs > 512)
+ nbufs = 512;
+
+ /* give up if there are no buffers available */
+ if (nbufs <= 0)
+ return;
+
+ /* Note that we *should* have free pages to consume here
+ * and no checks are needed.
+ */
+ cons = vmq->unmapped_pages_cons;
+
+ while (count < nbufs) {
+ idx = vmq->unmapped_pages[VMQ_IDX_MASK(cons++)];
+ buf = vmq->buffer[idx].buf = _get_tx_buffer(nc);
+ /* Setup grant map operation */
+ gnttab_set_map_op(&rx_map_ops[count],
+ vmq_idx_to_kaddr(vmq, idx),
+ GNTMAP_host_map,
+ buf->gref,
+ nc->rings.otherend_id);
+ alloc_idx[count] = idx;
+ count++;
+ }
+
+ vmq->unmapped_pages_cons = cons;
+
+ /* Map all the pages */
+ BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ rx_map_ops, nbufs));
+
+ /* Finalize buffer mapping after checking if the grant operations
+ succeeded */
+ nc2_vmq_end_map_buffers(rx_map_ops, nbufs, nc, alloc_idx);
+
+ vmq->nbufs += nbufs;
+}
+
+static void nc2_vmq_unmap_buf(struct netchannel2 *nc,
+ unsigned int idx, int recycle)
+{
+ nc2_vmq_t *vmq = &nc->vmq;
+ unsigned long pfn;
+ gnttab_unmap_grant_ref_t *gop;
+ unsigned prod;
+
+ pfn = vmq_idx_to_pfn(vmq, idx);
+ /* Already unmapped? */
+ if (!phys_to_machine_mapping_valid(pfn))
+ return;
+
+ gop = vmq_next_unmap_gop();
+ gnttab_set_unmap_op(gop, vmq_idx_to_kaddr(vmq, idx),
+ GNTMAP_host_map,
+ vmq->buffer[idx].buf->grant_handle);
+
+ vmq->nbufs--;
+
+ set_phys_to_machine(__pa(vmq_idx_to_kaddr(vmq, idx)) >>
+ PAGE_SHIFT,
+ INVALID_P2M_ENTRY);
+ /* Ready for next use. */
+ gnttab_reset_grant_page(vmq->pages[idx]);
+ /* Add the page back to the unmapped list */
+ prod = vmq->unmapped_pages_prod;
+ vmq->unmapped_pages[VMQ_IDX_MASK(prod++)] = idx;
+ if (recycle)
+ recycle_tx_buffer(nc, vmq->buffer[idx].buf);
+ else
+ free_tx_buffer(nc, vmq->buffer[idx].buf);
+ smp_wmb();
+ vmq->unmapped_pages_prod = prod;
+}
+
+static void nc2_vmq_free_mapped_bufs(struct netchannel2 *nc)
+{
+ nc2_vmq_t *vmq = &nc->vmq;
+ unsigned int idx;
+ unsigned prod, cons;
+
+ /* The queue should be disabled before this function is called */
+ BUG_ON(vmq->vmq_state == VMQ_QUEUE_ENABLED);
+
+ cons = vmq->mapped_pages_cons;
+ prod = vmq->mapped_pages_prod;
+ smp_rmb();
+
+ while (cons != prod) {
+ idx = vmq->mapped_pages[VMQ_IDX_MASK(cons++)];
+ nc2_vmq_unmap_buf(nc, idx, 1);
+ }
+
+ vmq_flush_unmap_grants();
+
+ vmq->mapped_pages_cons = cons;
+
+}
+
+static void nc2_vmq_free_skb(struct sk_buff *skb)
+{
+ struct netchannel2 *nc;
+ nc2_vmq_t *vmq;
+ unsigned int idx;
+ int nr_frags, i;
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ skb_frag_t *frags = shinfo->frags;
+
+ nc = netdev_priv(skb->dev);
+ vmq = &nc->vmq;
+
+ nr_frags = shinfo->nr_frags;
+ for (i = 0; i < nr_frags; i++) {
+ idx = nc2_vmq_page_index(frags[i].page);
+ nc2_vmq_unmap_buf(nc, idx, 1);
+ }
+
+ vmq_flush_unmap_grants();
+
+ shinfo->frag_list = NULL;
+ shinfo->nr_frags = 0;
+
+ /* Add the skb back to the free pool */
+ skb_queue_tail(&vmq->free_skb_list, skb);
+}
+
+/* Initialize the free socket buffer list */
+static int vmq_init_free_skb_list(int n, struct sk_buff_head *free_skb_list)
+{
+ int i;
+ struct sk_buff *skb;
+
+ skb_queue_head_init(free_skb_list);
+
+ for (i = 0; i < n; i++) {
+ skb = alloc_skb(VMQ_SKB_SIZE, GFP_ATOMIC);
+ if (!skb) {
+ printk("Netchannel2 vmq: Failed to allocate socket "
+ "buffer %d (max=%d)\n", i, (int)n);
+ goto error;
+ }
+ skb_queue_tail(free_skb_list, skb);
+ }
+
+ return 0;
+error:
+ /* Free all the allocated buffers and return Error */
+ while (!skb_queue_empty(free_skb_list))
+ kfree_skb(skb_dequeue(free_skb_list));
+
+ return -1;
+}
+
+/* Initialize vmq. Return 1 if vmq is used and 0 otherwise */
+int nc2_vmq_connect(struct netchannel2 *nc)
+{
+ nc2_vmq_t *vmq = &nc->vmq;
+ struct page *page;
+ int q_id;
+ int size;
+ int i;
+
+ vmq->vmq_mode = 0;
+ vmq->pdev = read_pdev(nc->xenbus_device);
+
+ /* cannot use vmq mode if physical device not found */
+ if (!vmq->pdev)
+ return 0;
+
+ /* Allocate a RX queue */
+ q_id = vmq_alloc_queue(vmq->pdev, VMQ_TYPE_RX);
+ if (q_id < 0)
+ /* Allocation failed, cannot use multi-queue */
+ goto free_pdev;
+
+ vmq->vmq_id = q_id;
+
+ /* Set the size of the queue */
+ size = vmq_get_maxsize(vmq->pdev);
+ if (size > VMQ_QUEUE_SIZE)
+ size = VMQ_QUEUE_SIZE;
+ if (vmq_set_size(vmq->pdev, q_id, size) < 0) {
+ /* Failure, free up the queue and return error */
+ printk(KERN_ERR "%s: could not set queue size on net device\n",
+ __func__);
+ goto free_queue;
+ }
+ vmq->vmq_size = size;
+
+ /* Set the mac address of the queue */
+ if (vmq_set_mac(vmq->pdev, q_id, nc->rings.remote_mac) < 0) {
+ /* Failure, free up the queue and return error */
+ printk(KERN_ERR "%s: could not set MAC address for net device
queue\n",
+ __func__);
+ goto free_queue;
+ }
+
+ vmq->pages = alloc_empty_pages_and_pagevec(VMQ_MAX_BUFFERS);
+ if (vmq->pages == NULL) {
+ printk(KERN_ERR "%s: out of memory\n", __func__);
+ goto free_queue;
+ }
+
+ skb_queue_head_init(&vmq->dealloc_queue);
+ skb_queue_head_init(&vmq->rx_queue);
+
+ if (vmq_init_free_skb_list(VMQ_MAX_BUFFERS,
+ &vmq->free_skb_list)) {
+ printk(KERN_ERR "%s: Could not allocate free socket buffers",
+ __func__);
+ goto free_pagevec;
+ }
+
+ for (i = 0; i < VMQ_MAX_BUFFERS; i++) {
+ vmq->buffer[i].nc = nc;
+ page = vmq->pages[i];
+ SetPageForeign(page, nc2_vmq_page_release);
+ page->mapping = (void *)&vmq->buffer[i];
+ vmq->unmapped_pages[i] = i;
+ }
+
+ vmq->unmapped_pages_prod = VMQ_MAX_BUFFERS;
+ vmq->unmapped_pages_cons = 0;
+
+ vmq->mapped_pages_prod = 0;
+ vmq->mapped_pages_cons = 0;
+
+ vmq->nbufs = 0;
+ vmq->vmq_mode = 1;
+
+ /* Store the pointer to netchannel2 device in pdev */
+ BUG_ON((vmq->pdev->vmq == NULL) || (vmq->pdev->vmq->queue == NULL));
+ vmq->pdev->vmq->queue[q_id].guest = (void *)nc->net_device;
+
+ atomic_set(&vmq->refcnt, 0);
+ init_waitqueue_head(&vmq->waiting_to_free);
+
+ printk(KERN_INFO "Netchannel2 using vmq mode for guest %d\n",
+ nc->xenbus_device->otherend_id);
+
+ vmq->vmq_state = VMQ_QUEUE_STARTING;
+
+ return 1; /* Success */
+
+
+free_pagevec:
+ free_empty_pages_and_pagevec(vmq->pages, VMQ_MAX_BUFFERS);
+free_queue:
+ vmq_free_queue(vmq->pdev, vmq->vmq_id);
+free_pdev:
+ dev_put(vmq->pdev);
+ vmq->pdev = NULL;
+ return 0;
+}
+
+void nc2_vmq_shutdown(struct netchannel2 *nc)
+{
+ nc2_vmq_t *vmq = &nc->vmq;
+ int i;
+
+ if (!vmq->vmq_mode)
+ return;
+
+ /* All posted bufs should have been returned */
+ BUG_ON(nr_vmq_bufs(nc) != nr_vmq_mapped_bufs(nc));
+
+ /* free the mapped bufs */
+ nc2_vmq_free_mapped_bufs(nc);
+
+ /* Free the vmq pages */
+ if (vmq->pages) {
+ for (i = 0; i < VMQ_MAX_BUFFERS; i++) {
+ if (PageForeign(vmq->pages[i]))
+ ClearPageForeign(vmq->pages[i]);
+ vmq->pages[i]->mapping = NULL;
+ }
+ free_empty_pages_and_pagevec(vmq->pages, VMQ_MAX_BUFFERS);
+ vmq->pages = NULL;
+ }
+
+ while (!skb_queue_empty(&vmq->free_skb_list)) {
+ /* Free the socket buffer pool */
+ kfree_skb(skb_dequeue(&vmq->free_skb_list));
+ }
+ vmq->vmq_state = VMQ_QUEUE_DISABLED;
+ vmq->vmq_mode = 0;
+
+ if (vmq->pdev) {
+ dev_put(vmq->pdev);
+ vmq->pdev = NULL;
+ }
+
+ vmq_put(vmq);
+}
+
+static int prepare_xmit_allocate_vmq(struct netchannel2 *nc,
+ struct sk_buff *skb)
+{
+ unsigned msg_size;
+
+ msg_size = get_transmitted_packet_msg_size(skb);
+ if (!nc2_reserve_payload_bytes(&nc->rings.prod_ring, msg_size))
+ return -1;
+ return 0;
+}
+
+void do_vmq_work(struct netchannel2 *nc)
+{
+ nc2_vmq_t *vmq = &nc->vmq;
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ /* if not in vmq mode do nothing */
+ if (!nc2_in_vmq_mode(nc))
+ return;
+
+ /* Map guest buffers for dedicated NIC RX queue if needed */
+ if (nr_vmq_bufs(nc) < VMQ_MIN_BUFFERS) {
+ nc2_vmq_map_buffers(nc);
+ /* We delay enabling the queue until we have enough
+ posted buffers. Check if it is time to enable it */
+ if (nc2_vmq_is_starting(nc) &&
+ (nr_vmq_bufs(nc) >= VMQ_MIN_BUFFERS)) {
+ nc2_vmq_enable(nc);
+ }
+ }
+
+ /* free vmq skb's returned by the physical device driver */
+ while (!skb_queue_empty(&nc->vmq.dealloc_queue))
+ nc2_vmq_free_skb(skb_dequeue(&nc->vmq.dealloc_queue));
+
+ /* complete vmq closing after all packets returned by physical
+ * device driver */
+
+ if (nc2_vmq_is_closing(nc) &&
+ (nr_vmq_bufs(nc) == nr_vmq_mapped_bufs(nc))) {
+ nc->vmq.vmq_state = VMQ_QUEUE_DISABLED;
+ nc2_vmq_shutdown(nc);
+ }
+
+ spin_lock_irqsave(&vmq->rx_queue.lock, flags);
+ while (!skb_queue_empty(&vmq->rx_queue)) {
+ skb = __skb_dequeue(&nc->vmq.rx_queue);
+ if (prepare_xmit_allocate_vmq(nc, skb) < 0) {
+ __skb_queue_head(&vmq->rx_queue, skb);
+ spin_unlock_irqrestore(&vmq->rx_queue.lock, flags);
+ return;
+ }
+ __skb_queue_tail(&nc->rings.pending_tx_queue, skb);
+ }
+ spin_unlock_irqrestore(&vmq->rx_queue.lock, flags);
+}
+
+/* Return the netchannel2 device corresponding to the given queue in pdev */
+static inline struct net_device *nc2_vmq_queue_to_vif(struct net_device *pdev,
+ int queue_id)
+{
+ net_vmq_t *n_vmq;
+ vmq_queue_t *vmq_q;
+
+ n_vmq = pdev->vmq;
+ BUG_ON(n_vmq == NULL);
+ vmq_q = &n_vmq->queue[queue_id];
+ BUG_ON(vmq_q == NULL);
+
+ return (struct net_device *)vmq_q->guest;
+}
+
+/* Handle incoming vmq packet */
+int vmq_netif_rx(struct sk_buff *skb, int queue_id)
+{
+ struct skb_cb_overlay *skb_co = get_skb_overlay(skb);
+ struct net_device *dev;
+ struct netchannel2 *nc;
+ nc2_vmq_t *vmq;
+
+ memset(skb_co, 0, sizeof(*skb_co));
+
+ skb_co->nr_fragments = skb_shinfo(skb)->nr_frags;
+ skb_co->type = NC2_PACKET_TYPE_pre_posted;
+ skb_co->policy = transmit_policy_vmq;
+
+ /* get the netchannel2 interface corresponding to this queue */
+ dev = nc2_vmq_queue_to_vif(skb->dev, queue_id);
+ nc = netdev_priv(dev);
+ vmq = &nc->vmq;
+
+ /* replace source dev with destination dev */
+ skb->dev = dev;
+ /* add skb to rx_queue */
+ skb_queue_tail(&vmq->rx_queue, skb);
+
+ /* Trigger thread execution to process new packets */
+ nc2_kick(&nc->rings);
+
+ return 0;
+}
+EXPORT_SYMBOL(vmq_netif_rx);
+
+
+/* Allocate a socket buffer from the free list, get a guest posted
+ * buffer, attach it to the skb, and return it.
+ */
+struct sk_buff *vmq_alloc_skb(struct net_device *netdevice, int queue_id,
+ unsigned int length)
+{
+ struct sk_buff *skb;
+ struct netchannel2 *nc;
+ nc2_vmq_t *vmq;
+ unsigned int idx;
+ int nr_bufs, i;
+ unsigned int cons;
+ unsigned int prod;
+
+ /* get the netchannel2 interface corresponding to this queue */
+ nc = netdev_priv(nc2_vmq_queue_to_vif(netdevice, queue_id));
+
+ vmq = &nc->vmq;
+
+ /* Get a free buffer from the pool */
+ if (skb_queue_empty(&vmq->free_skb_list)) {
+ /* No buffers to allocate */
+ return NULL;
+ }
+
+
+ skb = skb_dequeue(&vmq->free_skb_list);
+ BUG_ON(skb == NULL);
+
+ nr_bufs = VMQ_NUM_BUFFERS(length);
+
+ cons = vmq->mapped_pages_cons;
+ prod = vmq->mapped_pages_prod;
+ smp_rmb();
+
+ if (nr_bufs > (prod - cons))
+ /* Not enough mapped buffers in the pool */
+ goto kick_nc2;
+
+ if (nr_bufs > MAX_SKB_FRAGS)
+ goto error;
+
+ for (i = 0; i < nr_bufs; i++) {
+ idx = vmq->mapped_pages[VMQ_IDX_MASK(cons)];
+ /* FIX ME: This can be simplified */
+ skb_shinfo(skb)->frags[i].page =
+ virt_to_page(vmq_idx_to_kaddr(vmq, idx));
+ skb_shinfo(skb)->frags[i].page_offset = 0;
+ skb_shinfo(skb)->frags[i].size = PAGE_SIZE;
+ skb_shinfo(skb)->nr_frags++;
+ skb->dev = netdevice;
+ cons++;
+ }
+
+ vmq->mapped_pages_cons = cons;
+
+ /* If the number of buffers gets low, run the tasklet to map more buffers */
+ if (nr_vmq_bufs(nc) < VMQ_MIN_BUFFERS)
+ nc2_kick(&nc->rings);
+
+ return skb;
+
+kick_nc2:
+ /* kick netchannel2 interface to get any recently posted buffers */
+ nc2_kick(&nc->rings);
+error:
+ /* Add the skb back to the free pool */
+ skb_queue_tail(&vmq->free_skb_list, skb);
+ return NULL;
+}
+EXPORT_SYMBOL(vmq_alloc_skb);
+
+/* Detach the guest pages and free the socket buffer */
+void vmq_free_skb(struct sk_buff *skb, int queue_id)
+{
+ struct net_device *dev;
+ struct netchannel2 *nc;
+ nc2_vmq_t *vmq;
+
+ /* get the netchannel2 interface corresponding to this queue */
+ dev = nc2_vmq_queue_to_vif(skb->dev, queue_id);
+
+ nc = netdev_priv(dev);
+ vmq = &nc->vmq;
+
+ /* Add skb to the dealloc queue */
+ skb->dev = dev;
+ skb_queue_tail(&vmq->dealloc_queue, skb);
+
+ /* kick netchannel2 interface */
+ nc2_kick(&nc->rings);
+
+}
+EXPORT_SYMBOL(vmq_free_skb);
+
+int nc2_is_vmq_packet(struct netchannel2 *nc, struct sk_buff *skb)
+{
+ int nr_frags;
+ long idx;
+ nc2_vmq_t *vmq = &nc->vmq;
+
+ nr_frags = skb_shinfo(skb)->nr_frags;
+ if (vmq->vmq_mode && nr_frags &&
+ PageForeign(skb_shinfo(skb)->frags[0].page)) {
+ idx = nc2_vmq_page_index(skb_shinfo(skb)->frags[0].page);
+ if ((idx >= 0) && (idx < VMQ_MAX_BUFFERS))
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Prepare to transmit a vmq packet */
+void xmit_vmq(struct netchannel2 *nc, struct sk_buff *skb,
+ volatile void *msg_buf)
+{
+ volatile struct netchannel2_msg_packet *msg = msg_buf;
+ volatile struct netchannel2_fragment *out_frag;
+ nc2_vmq_t *vmq = &nc->vmq;
+ skb_frag_t *frag;
+ struct nc2_tx_buffer *txbuf;
+ int nr_frags;
+ unsigned int idx;
+ unsigned x;
+
+ nr_frags = skb_shinfo(skb)->nr_frags;
+ for (x = 0; x < nr_frags; x++) {
+ frag = &skb_shinfo(skb)->frags[x];
+ out_frag = &msg->frags[x];
+
+ idx = nc2_vmq_page_index(frag->page);
+ txbuf = vmq->buffer[idx].buf;
+ out_frag->pre_post.id = txbuf->id;
+ out_frag->off = frag->page_offset;
+ out_frag->size = frag->size;
+ /* TODO: need to batch unmap grants */
+ nc2_vmq_unmap_buf(nc, idx, 0);
+ }
+
+ /* Avoid unmapping the frag grants again when the skb is freed
+ * later by nc2_vmq_free_skb() */
+ skb_shinfo(skb)->nr_frags = 0;
+}
+
diff --git a/drivers/xen/netchannel2/vmq.h b/drivers/xen/netchannel2/vmq.h
new file mode 100644
index 0000000..fa1cc8a
--- /dev/null
+++ b/drivers/xen/netchannel2/vmq.h
@@ -0,0 +1,58 @@
+#ifndef VMQ_H__
+#define VMQ_H__
+
+#include "netchannel2_core.h"
+
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+
+int nc2_vmq_connect(struct netchannel2 *nc);
+void nc2_vmq_disconnect(struct netchannel2 *nc);
+void do_vmq_work(struct netchannel2 *nc);
+int nc2_is_vmq_packet(struct netchannel2 *nc, struct sk_buff *skb);
+void xmit_vmq(struct netchannel2 *nc, struct sk_buff *skb,
+ volatile void *msg);
+void vmq_flush_unmap_hypercall(void);
+
+#define vmq_get(_b) \
+ atomic_inc(&(_b)->refcnt);
+
+#define vmq_put(_b) \
+ do { \
+ if (atomic_dec_and_test(&(_b)->refcnt)) { \
+ wake_up(&(_b)->waiting_to_free); \
+ } \
+ } while (0)
+
+static inline int nr_vmq_mapped_bufs(struct netchannel2 *nc)
+{
+ return nc->vmq.mapped_pages_prod -
+ nc->vmq.mapped_pages_cons;
+}
+
+static inline int nr_vmq_bufs(struct netchannel2 *nc)
+{
+ return nc->vmq.nbufs;
+}
+
+static inline int nc2_in_vmq_mode(struct netchannel2 *nc)
+{
+ return nc->vmq.vmq_mode;
+}
+
+#else
+static inline int nc2_vmq_connect(struct netchannel2 *nc)
+{
+ return 0;
+}
+static inline void nc2_vmq_disconnect(struct netchannel2 *nc)
+{
+}
+static inline void do_vmq_work(struct netchannel2 *nc)
+{
+}
+static inline void vmq_flush_unmap_hypercall(void)
+{
+}
+#endif /* CONFIG_XEN_NETDEV2_VMQ */
+
+#endif /* !VMQ_H__ */
diff --git a/drivers/xen/netchannel2/vmq_def.h b/drivers/xen/netchannel2/vmq_def.h
new file mode 100644
index 0000000..60f1ccb
--- /dev/null
+++ b/drivers/xen/netchannel2/vmq_def.h
@@ -0,0 +1,68 @@
+#ifndef VMQ_DEF_H__
+#define VMQ_DEF_H__
+
+
+/* size of HW queue in VMQ device */
+#define VMQ_QUEUE_SIZE 1024
+
+/* Minimum number of buffers needed for VMQ.
+ * This is the low-water mark that triggers mapping more guest buffers.
+ * Should be larger than the queue size to allow for in-flight packets.
+ */
+#define VMQ_MIN_BUFFERS 1920
+
+/* Maximum number of posted buffers which are reserved for VMQ.
+ * Should be less than MAX_POSTED_BUFFERS. For now, the difference can be used
+ * for intra-node guest-to-guest traffic. When we map guest buffers we try to
+ * have VMQ_MAX_BUFFERS mapped. The difference (VMQ_MAX_BUFFERS-VMQ_MIN_BUFFERS)
+ * helps batch multiple grant map operations.
+ * VMQ_QUEUE_SIZE < VMQ_MIN_BUFFERS < VMQ_MAX_BUFFERS < MAX_POSTED_BUFFERS
+ * VMQ_MAX_BUFFERS must be a power of 2.
+ */
+#define VMQ_MAX_BUFFERS 2048
+
+/* skb size is zero since packet data uses fragments */
+#define VMQ_SKB_SIZE 0
+
+#define VMQ_NUM_BUFFERS(len) (((len) + PAGE_SIZE - 1) / PAGE_SIZE)
+
+#define VMQ_IDX_MASK(_i) ((_i)&(VMQ_MAX_BUFFERS-1))
+
+typedef struct nc2_vmq_buf {
+ struct nc2_tx_buffer *buf;
+ struct netchannel2 *nc;
+} nc2_vmq_buf_t;
+
+typedef struct nc2_vmq {
+ struct net_device *pdev; /* Pointer to physical device */
+ int vmq_mode; /* indicate if vif is in vmq mode */
+ struct page **pages; /* pages for mapping guest RX bufs */
+ struct sk_buff_head free_skb_list; /* Free socket buffer pool */
+ struct sk_buff_head dealloc_queue; /* list of skbs to be freed */
+ struct sk_buff_head rx_queue; /* list of received packets */
+
+ /* guest mapped buffers */
+ nc2_vmq_buf_t buffer[VMQ_MAX_BUFFERS];
+
+ /* Ring with free pages available for mapping guest RX buffers */
+ u16 unmapped_pages[VMQ_MAX_BUFFERS];
+ unsigned int unmapped_pages_prod;
+ unsigned int unmapped_pages_cons;
+
+ /* Ring of mapped RX pages available for the vmq device */
+ u16 mapped_pages[VMQ_MAX_BUFFERS];
+ unsigned int mapped_pages_prod;
+ unsigned int mapped_pages_cons;
+
+ unsigned int nbufs; /* number of vmq buffers: posted to */
+ /* HW queue or available to be posted */
+ int vmq_id; /* Queue id */
+ int vmq_size; /* Queue size */
+ int vmq_state; /* queue state */
+
+ atomic_t refcnt;
+ wait_queue_head_t waiting_to_free;
+
+} nc2_vmq_t;
+
+#endif /* !VMQ_DEF_H__ */
diff --git a/drivers/xen/netchannel2/xmit_packet.c b/drivers/xen/netchannel2/xmit_packet.c
index 1a879aa..09827fc 100644
--- a/drivers/xen/netchannel2/xmit_packet.c
+++ b/drivers/xen/netchannel2/xmit_packet.c
@@ -3,6 +3,7 @@
#include <linux/kernel.h>
#include <linux/version.h>
#include "netchannel2_core.h"
+#include "vmq.h"
/* You don't normally want to transmit in posted buffers mode, because
grant mode is usually faster, but it's sometimes useful for testing
@@ -189,6 +190,11 @@ int nc2_really_start_xmit(struct netchannel2_ring_pair *ncrp,
set_offload_flags(skb, msg);
switch (skb_co->policy) {
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+ case transmit_policy_vmq:
+ xmit_vmq(nc, skb, msg);
+ break;
+#endif
case transmit_policy_small:
/* Nothing to do */
break;
--
1.6.3.1